From 0df8dbdfdf3b3b809bdf4cb38cf89e1345c746ae Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 21 Oct 2025 13:15:34 +0200 Subject: [PATCH 01/22] feat: add metadata model hierarchy Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 65 +++ docs/DoclingDocument.json | 354 ++++++++++++++++ examples/metadata.ipynb | 388 ++++++++++++++++++ test/data/doc/dummy_doc_with_meta.yaml | 285 +++++++++++++ .../doc/dummy_doc_with_meta_modified.yaml | 286 +++++++++++++ test/data/docling_document/unit/CodeItem.yaml | 1 + .../docling_document/unit/FloatingItem.yaml | 3 +- test/data/docling_document/unit/FormItem.yaml | 1 + .../docling_document/unit/FormulaItem.yaml | 1 + .../docling_document/unit/KeyValueItem.yaml | 1 + test/data/docling_document/unit/ListItem.yaml | 1 + .../docling_document/unit/PictureItem.yaml | 3 +- .../unit/SectionHeaderItem.yaml | 1 + .../data/docling_document/unit/TableItem.yaml | 1 + test/data/docling_document/unit/TextItem.yaml | 1 + .../data/docling_document/unit/TitleItem.yaml | 1 + test/test_metadata.py | 19 + 17 files changed, 1410 insertions(+), 2 deletions(-) create mode 100644 examples/metadata.ipynb create mode 100644 test/data/doc/dummy_doc_with_meta.yaml create mode 100644 test/data/doc/dummy_doc_with_meta_modified.yaml create mode 100644 test/test_metadata.py diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 45d8611b..2a466a52 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -941,6 +941,55 @@ class ContentLayer(str, Enum): DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY} +class BaseMeta(BaseModel): + """Base class for metadata.""" + + model_config = ConfigDict(extra="allow") + + +class SummaryInstance(BaseModel): + """Single summary data point.""" + + text: str + confidence: Optional[float] = None + provenance: Optional[str] = None + + +class SummaryModel(BaseModel): + """Summary data.""" + + # convention: the first instance represents the main summary + instances: List[SummaryInstance] = Field(default_factory=list, min_length=1) + # NOTE: if needed, can add validator to coerce simpler forms to instances + + +class CommonMeta(BaseMeta): + """Common metadata model.""" + + summary: Optional[SummaryModel] = None + + +class PictureMeta(CommonMeta): + """Picture metadata model.""" + + # TODO the previous classes include "kind" for disambiguation, which is not needed here + classification: Optional[PictureClassificationData] = None + molecule: Optional[PictureMoleculeData] = None + tabular_chart: Optional[PictureTabularChartData] = None + line_chart: Optional[PictureLineChartData] = None + bar_chart: Optional[PictureBarChartData] = None + stacked_bar_chart: Optional[PictureStackedBarChartData] = None + pie_chart: Optional[PicturePieChartData] = None + scatter_chart: Optional[PictureScatterChartData] = None + + +class TableMeta(CommonMeta): + """Table metadata model.""" + + # TODO the previous classes include "kind" for disambiguation, which is not needed here + description: Optional[DescriptionAnnotation] = None + + class NodeItem(BaseModel): """NodeItem.""" @@ -952,6 +1001,8 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") + meta: Optional[BaseMeta] = None + def get_ref(self) -> RefItem: """get_ref.""" return RefItem(cref=self.self_ref) @@ -1048,6 +1099,8 @@ def _add_sibling( class GroupItem(NodeItem): # Container type, can't be a leaf node """GroupItem.""" + meta: Optional[CommonMeta] = None + name: str = ( "group" # Name of the group, e.g. 
"Introduction Chapter", # "Slide 5", "Navigation menu list", ... @@ -1098,6 +1151,7 @@ class DocItem( label: DocItemLabel prov: List[ProvenanceItem] = [] + meta: Optional[CommonMeta] = None def get_location_tokens( self, @@ -1407,6 +1461,7 @@ class PictureItem(FloatingItem): ) annotations: List[PictureDataType] = [] + meta: Optional[PictureMeta] = None # Convert the image to Base64 def _image_to_base64(self, pil_image, format="PNG"): @@ -1555,6 +1610,7 @@ class TableItem(FloatingItem): ] = DocItemLabel.TABLE annotations: List[TableAnnotationType] = [] + meta: Optional[TableMeta] = None def export_to_dataframe( self, doc: Optional["DoclingDocument"] = None @@ -5746,6 +5802,13 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument": return res_doc def _validate_rules(self): + + def validate_furniture(doc: DoclingDocument): + if doc.furniture.children: + raise ValueError( + f"Deprecated furniture node {doc.furniture.self_ref} has children" + ) + def validate_list_group(doc: DoclingDocument, item: ListGroup): for ref in item.children: child = ref.resolve(doc) @@ -5768,6 +5831,8 @@ def validate_group(doc: DoclingDocument, item: GroupItem): ): # tolerate empty body, but not other groups raise ValueError(f"Group {item.self_ref} has no children") + validate_furniture(self) + for item, _ in self.iterate_items( with_groups=True, traverse_pictures=True, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..22e468f9 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -194,6 +194,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "code", "default": "code", @@ -357,6 +368,25 @@ "title": "CodeLanguageLabel", "type": "string" }, + "CommonMeta": { + "additionalProperties": true, + "description": "Common metadata model.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryModel" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "CommonMeta", + "type": "object" + }, "ContentLayer": { "description": "ContentLayer.", "enum": [ @@ -475,6 +505,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "form", "default": "form", @@ -598,6 +639,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "formula", "default": "formula", @@ -807,6 +859,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -912,6 +975,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -962,6 +1036,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1054,6 +1139,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": 
"#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -1104,6 +1200,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "list_item", "default": "list_item", @@ -1341,6 +1448,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/PictureMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "default": "picture", "enum": [ @@ -1492,6 +1610,113 @@ "title": "PictureLineChartData", "type": "object" }, + "PictureMeta": { + "additionalProperties": true, + "description": "Picture metadata model.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryModel" + }, + { + "type": "null" + } + ], + "default": null + }, + "classification": { + "anyOf": [ + { + "$ref": "#/$defs/PictureClassificationData" + }, + { + "type": "null" + } + ], + "default": null + }, + "molecule": { + "anyOf": [ + { + "$ref": "#/$defs/PictureMoleculeData" + }, + { + "type": "null" + } + ], + "default": null + }, + "tabular_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PictureTabularChartData" + }, + { + "type": "null" + } + ], + "default": null + }, + "line_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PictureLineChartData" + }, + { + "type": "null" + } + ], + "default": null + }, + "bar_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PictureBarChartData" + }, + { + "type": "null" + } + ], + "default": null + }, + "stacked_bar_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PictureStackedBarChartData" + }, + { + "type": "null" + } + ], + "default": null + }, + "pie_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PicturePieChartData" + }, + { + "type": "null" + } + ], + "default": null + }, + "scatter_chart": { + "anyOf": [ + { + "$ref": "#/$defs/PictureScatterChartData" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "PictureMeta", + "type": "object" + }, "PictureMoleculeData": { "description": "PictureMoleculeData.", "properties": { @@ -1842,6 +2067,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "section_header", "default": "section_header", @@ -1926,6 +2162,59 @@ "title": "Size", "type": "object" }, + "SummaryInstance": { + "description": "Single summary data point.", + "properties": { + "text": { + "title": "Text", + "type": "string" + }, + "confidence": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Confidence" + }, + "provenance": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Provenance" + } + }, + "required": [ + "text" + ], + "title": "SummaryInstance", + "type": "object" + }, + "SummaryModel": { + "description": "Summary data.", + "properties": { + "instances": { + "items": { + "$ref": "#/$defs/SummaryInstance" + }, + "minItems": 1, + "title": "Instances", + "type": "array" + } + }, + "title": "SummaryModel", + "type": "object" + }, "TableCell": { "description": "TableCell.", "properties": { @@ -2065,6 +2354,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/TableMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "default": "table", "enum": [ @@ -2150,6 
+2450,36 @@ "title": "TableItem", "type": "object" }, + "TableMeta": { + "additionalProperties": true, + "description": "Table metadata model.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryModel" + }, + { + "type": "null" + } + ], + "default": null + }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/DescriptionAnnotation" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "TableMeta", + "type": "object" + }, "TextItem": { "additionalProperties": false, "description": "TextItem.", @@ -2182,6 +2512,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "enum": [ "caption", @@ -2285,6 +2626,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/CommonMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "title", "default": "title", @@ -2382,6 +2734,7 @@ "parent": null, "children": [], "content_layer": "furniture", + "meta": null, "name": "_root_", "label": "unspecified" }, @@ -2394,6 +2747,7 @@ "parent": null, "children": [], "content_layer": "body", + "meta": null, "name": "_root_", "label": "unspecified" } diff --git a/examples/metadata.ipynb b/examples/metadata.ipynb new file mode 100644 index 00000000..bb70a469 --- /dev/null +++ b/examples/metadata.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "e638ac23", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
───────────────────────────────────────────────────── #/body ──────────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m───────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/\u001b[0m\u001b[1;95mbody\u001b[0m\u001b[92m ──────────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
──────────────────────────────────────────────────── #/texts/1 ────────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m1\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
CommonMeta(\n",
+       "    summary=SummaryModel(\n",
+       "        instances=[\n",
+       "            SummaryInstance(text='This is a title.', confidence=0.95, provenance='model1'),\n",
+       "            SummaryInstance(text='This is a figure.', confidence=0.42, provenance='model2')\n",
+       "        ]\n",
+       "    ),\n",
+       "    example_custom_field_1='More stuff here.'\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mCommonMeta\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryModel\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33minstances\u001b[0m=\u001b[1m[\u001b[0m\n", + " \u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a title.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.95\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m\u001b[1m)\u001b[0m,\n", + " \u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a figure.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.42\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model2'\u001b[0m\u001b[1m)\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mexample_custom_field_1\u001b[0m=\u001b[32m'More stuff here.'\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta.__pydantic_extra__:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta.__pydantic_extra__:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{'example_custom_field_1': 'More stuff here.'}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\u001b[32m'example_custom_field_1'\u001b[0m: \u001b[32m'More stuff here.'\u001b[0m\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
────────────────────────────────────────────────── #/pictures/0 ───────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/pictures/\u001b[0m\u001b[1;95m0\u001b[0m\u001b[92m ───────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
PictureMeta(\n",
+       "    summary=None,\n",
+       "    classification=PictureClassificationData(\n",
+       "        kind='classification',\n",
+       "        provenance='model1',\n",
+       "        predicted_classes=[PictureClassificationClass(class_name='bar_chart', confidence=0.78)]\n",
+       "    ),\n",
+       "    molecule=PictureMoleculeData(\n",
+       "        kind='molecule_data',\n",
+       "        smi='CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1',\n",
+       "        confidence=0.98,\n",
+       "        class_name='chemistry_molecular_structure',\n",
+       "        segmentation=[(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)],\n",
+       "        provenance='model3-1.0.0'\n",
+       "    ),\n",
+       "    tabular_chart=None,\n",
+       "    line_chart=None,\n",
+       "    bar_chart=None,\n",
+       "    stacked_bar_chart=None,\n",
+       "    pie_chart=None,\n",
+       "    scatter_chart=None,\n",
+       "    example_custom_description_field={'provenance': 'model2', 'text': '...'},\n",
+       "    example_custom_misc_field={\n",
+       "        'content': {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}\n",
+       "    }\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mPictureMeta\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msummary\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mclassification\u001b[0m=\u001b[1;35mPictureClassificationData\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mkind\u001b[0m=\u001b[32m'classification'\u001b[0m,\n", + " \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m,\n", + " \u001b[33mpredicted_classes\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mPictureClassificationClass\u001b[0m\u001b[1m(\u001b[0m\u001b[33mclass_name\u001b[0m=\u001b[32m'bar_chart'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.78\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mmolecule\u001b[0m=\u001b[1;35mPictureMoleculeData\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mkind\u001b[0m=\u001b[32m'molecule_data'\u001b[0m,\n", + " \u001b[33msmi\u001b[0m=\u001b[32m'\u001b[0m\u001b[32mCC1\u001b[0m\u001b[32m=\u001b[0m\u001b[32mNNC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mC2\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN3C\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN\u001b[0m\u001b[32m=\u001b[0m\u001b[32mC3C\u001b[0m\u001b[32m(\u001b[0m\u001b[32mCC3\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=CC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=C3\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N1'\u001b[0m,\n", + " \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.98\u001b[0m,\n", + " \u001b[33mclass_name\u001b[0m=\u001b[32m'chemistry_molecular_structure'\u001b[0m,\n", + " \u001b[33msegmentation\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33mprovenance\u001b[0m=\u001b[32m'model3-1.0.0'\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mtabular_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mline_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mbar_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mstacked_bar_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mpie_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mscatter_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", + " \u001b[33mexample_custom_description_field\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'provenance'\u001b[0m: \u001b[32m'model2'\u001b[0m, \u001b[32m'text'\u001b[0m: \u001b[32m'...'\u001b[0m\u001b[1m}\u001b[0m,\n", + " \u001b[33mexample_custom_misc_field\u001b[0m=\u001b[1m{\u001b[0m\n", + " \u001b[32m'content'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta.__pydantic_extra__:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta.__pydantic_extra__:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "    'example_custom_description_field': {'provenance': 'model2', 'text': '...'},\n",
+       "    'example_custom_misc_field': {\n",
+       "        'content': {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[32m'example_custom_description_field'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'provenance'\u001b[0m: \u001b[32m'model2'\u001b[0m, \u001b[32m'text'\u001b[0m: \u001b[32m'...'\u001b[0m\u001b[1m}\u001b[0m,\n", + " \u001b[32m'example_custom_misc_field'\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[32m'content'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
──────────────────────────────────────────────────── #/texts/2 ────────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m2\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
CommonMeta(\n",
+       "    summary=SummaryModel(\n",
+       "        instances=[SummaryInstance(text='This is a section header.', confidence=None, provenance=None)]\n",
+       "    ),\n",
+       "    example_custom_field_added_programmaticaly=True\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mCommonMeta\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryModel\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33minstances\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a section header.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[3;35mNone\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[33mexample_custom_field_added_programmaticaly\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
item.meta.__pydantic_extra__:\n",
+       "
\n" + ], + "text/plain": [ + "item.meta.__pydantic_extra__:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{'example_custom_field_added_programmaticaly': True}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\u001b[32m'example_custom_field_added_programmaticaly'\u001b[0m: \u001b[3;92mTrue\u001b[0m\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
──────────────────────────────────────────────────── #/texts/3 ────────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m3\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
─────────────────────────────────────────────────── #/tables/0 ────────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[92m─────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/tables/\u001b[0m\u001b[1;95m0\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pathlib import Path\n", + "from docling_core.types.doc.document import ContentLayer, DoclingDocument\n", + "from rich.console import Console\n", + "import os\n", + "\n", + "source_path = Path(\"../test/data/doc/dummy_doc_with_meta.yaml\")\n", + "\n", + "doc = DoclingDocument.load_from_yaml(filename=source_path)\n", + "# doc._validate_rules()\n", + "\n", + "console = Console()\n", + "\n", + "for item, i in doc.iterate_items(\n", + " included_content_layers={cl for cl in ContentLayer}, with_groups=True, traverse_pictures=True):\n", + " console.rule(f\"[bold]{item.self_ref}\")\n", + " if item.meta:\n", + "\n", + " # showcasing adding a new field programmatically\n", + " if item.self_ref == \"#/texts/2\":\n", + " item.meta.example_custom_field_added_programmaticaly = True\n", + "\n", + " console.print(\"item.meta:\")\n", + " console.print(item.meta)\n", + " console.print(\"item.meta.__pydantic_extra__:\")\n", + " console.print(item.meta.__pydantic_extra__)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c65273e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml new file mode 100644 index 00000000..7ce87b77 --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -0,0 +1,285 @@ +body: + children: + - $ref: '#/texts/1' + - $ref: '#/pictures/0' + - $ref: '#/texts/3' + - $ref: '#/tables/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: + - $ref: '#/texts/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: [] +key_value_items: [] +name: dummy_doc +origin: + binary_hash: 7954723514066505909 + filename: dummy_doc.pdf + mimetype: application/pdf +pages: + '1': + image: + dpi: 144 + mimetype: image/png + size: + height: 1166.0 + width: 1536.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + page_no: 1 + size: + height: 583.15 + width: 768.23 +pictures: +- annotations: + - kind: classification + predicted_classes: + - class_name: bar_chart + confidence: 0.78 + provenance: model1 + - kind: description + provenance: model2 + text: '...' 
+ - class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + - content: + myanalysis: + prediction: abc + something_else: + text: aaa + kind: misc + captions: + - $ref: '#/texts/3' + children: + - $ref: '#/texts/2' + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + meta: + classification: + kind: classification + predicted_classes: + - class_name: bar_chart + confidence: 0.78 + provenance: model1 + example_custom_description_field: + provenance: model2 + text: '...' + example_custom_misc_field: + content: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: + class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + parent: + $ref: '#/body' + prov: + - bbox: + b: 623.4 + coord_origin: TOPLEFT + l: 456.3 + r: 702.5 + t: 145.8 + charspan: + - 0 + - 288 + page_no: 1 + references: [] + self_ref: '#/pictures/0' +- annotations: [] + captions: [] + children: [] + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 2.0 + width: 2.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/pictures/1' +schema_name: DoclingDocument +tables: +- annotations: + - kind: description + provenance: model3 + text: A description annotation for this table. + - content: + foo: bar + kind: misc + captions: [] + children: [] + content_layer: body + data: + grid: [] + num_cols: 0 + num_rows: 0 + table_cells: [] + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: table + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + references: [] + self_ref: '#/tables/0' +texts: +- children: [] + content_layer: body + label: page_header + orig: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 + parent: + $ref: '#/furniture' + prov: + - bbox: + b: 476.2 + coord_origin: TOPLEFT + l: 21.3 + r: 35.2 + t: 52.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/0' + text: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 +- children: [] + content_layer: body + label: title + meta: + example_custom_field_1: More stuff here. + summary: + instances: + - confidence: 0.95 + provenance: model1 + text: This is a title. + - confidence: 0.42 + provenance: model2 + text: This is a figure. 
+ orig: 'DocLayNet: A Large Human-Annotated Dataset for + + Document-Layout Analysis' + parent: + $ref: '#/body' + prov: + - bbox: + b: 53.4 + coord_origin: TOPLEFT + l: 65.0 + r: 623.2 + t: 30.1 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/1' + text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' +- children: [] + content_layer: body + label: section_header + level: 1 + meta: + summary: + instances: + - text: This is a section header. + orig: OPERATION (cont.) + parent: + $ref: '#/pictures/0' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 0 + - 734 + page_no: 1 + self_ref: '#/texts/2' + text: OPERATION (cont.) +- children: [] + content_layer: body + label: caption + orig: 'Figure 1: Four examples of complex page layouts across dif- + + ferent document categories' + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/3' + text: 'Figure 1: Four examples of complex page layouts across different document + categories' +version: 1.7.0 diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml new file mode 100644 index 00000000..ff5887cd --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -0,0 +1,286 @@ +body: + children: + - $ref: '#/texts/1' + - $ref: '#/pictures/0' + - $ref: '#/texts/3' + - $ref: '#/tables/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: + - $ref: '#/texts/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: [] +key_value_items: [] +name: dummy_doc +origin: + binary_hash: 7954723514066505909 + filename: dummy_doc.pdf + mimetype: application/pdf +pages: + '1': + image: + dpi: 144 + mimetype: image/png + size: + height: 1166.0 + width: 1536.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + page_no: 1 + size: + height: 583.15 + width: 768.23 +pictures: +- annotations: + - kind: classification + predicted_classes: + - class_name: bar_chart + confidence: 0.78 + provenance: model1 + - kind: description + provenance: model2 + text: '...' + - class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + - content: + myanalysis: + prediction: abc + something_else: + text: aaa + kind: misc + captions: + - $ref: '#/texts/3' + children: + - $ref: '#/texts/2' + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + meta: + classification: + kind: classification + predicted_classes: + - class_name: bar_chart + confidence: 0.78 + provenance: model1 + example_custom_description_field: + provenance: model2 + text: '...' 
+ example_custom_misc_field: + content: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: + class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + parent: + $ref: '#/body' + prov: + - bbox: + b: 623.4 + coord_origin: TOPLEFT + l: 456.3 + r: 702.5 + t: 145.8 + charspan: + - 0 + - 288 + page_no: 1 + references: [] + self_ref: '#/pictures/0' +- annotations: [] + captions: [] + children: [] + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 2.0 + width: 2.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/pictures/1' +schema_name: DoclingDocument +tables: +- annotations: + - kind: description + provenance: model3 + text: A description annotation for this table. + - content: + foo: bar + kind: misc + captions: [] + children: [] + content_layer: body + data: + grid: [] + num_cols: 0 + num_rows: 0 + table_cells: [] + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: table + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + references: [] + self_ref: '#/tables/0' +texts: +- children: [] + content_layer: body + label: page_header + orig: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 + parent: + $ref: '#/furniture' + prov: + - bbox: + b: 476.2 + coord_origin: TOPLEFT + l: 21.3 + r: 35.2 + t: 52.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/0' + text: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 +- children: [] + content_layer: body + label: title + meta: + example_custom_field_1: More stuff here. + summary: + instances: + - confidence: 0.95 + provenance: model1 + text: This is a title. + - confidence: 0.42 + provenance: model2 + text: This is a figure. + orig: 'DocLayNet: A Large Human-Annotated Dataset for + + Document-Layout Analysis' + parent: + $ref: '#/body' + prov: + - bbox: + b: 53.4 + coord_origin: TOPLEFT + l: 65.0 + r: 623.2 + t: 30.1 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/1' + text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' +- children: [] + content_layer: body + label: section_header + level: 1 + meta: + example_custom_field_added_programmaticaly: true + summary: + instances: + - text: This is a section header. + orig: OPERATION (cont.) + parent: + $ref: '#/pictures/0' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 0 + - 734 + page_no: 1 + self_ref: '#/texts/2' + text: OPERATION (cont.) 
+- children: [] + content_layer: body + label: caption + orig: 'Figure 1: Four examples of complex page layouts across dif- + + ferent document categories' + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/3' + text: 'Figure 1: Four examples of complex page layouts across different document + categories' +version: 1.7.0 diff --git a/test/data/docling_document/unit/CodeItem.yaml b/test/data/docling_document/unit/CodeItem.yaml index 09995640..c263a4f4 100644 --- a/test/data/docling_document/unit/CodeItem.yaml +++ b/test/data/docling_document/unit/CodeItem.yaml @@ -13,3 +13,4 @@ self_ref: '#' text: print(Hello World!) formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/FloatingItem.yaml b/test/data/docling_document/unit/FloatingItem.yaml index 21beef40..0c11c8f3 100644 --- a/test/data/docling_document/unit/FloatingItem.yaml +++ b/test/data/docling_document/unit/FloatingItem.yaml @@ -7,4 +7,5 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +content_layer: body +meta: null diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..f296d801 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -25,6 +25,7 @@ graph: target_cell_id: 0 image: null label: form +meta: null parent: null prov: [] references: [] diff --git a/test/data/docling_document/unit/FormulaItem.yaml b/test/data/docling_document/unit/FormulaItem.yaml index 25057908..cd631ff5 100644 --- a/test/data/docling_document/unit/FormulaItem.yaml +++ b/test/data/docling_document/unit/FormulaItem.yaml @@ -8,3 +8,4 @@ text: E=mc^2 content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/KeyValueItem.yaml b/test/data/docling_document/unit/KeyValueItem.yaml index 219e951e..f6db93e1 100644 --- a/test/data/docling_document/unit/KeyValueItem.yaml +++ b/test/data/docling_document/unit/KeyValueItem.yaml @@ -25,6 +25,7 @@ graph: target_cell_id: 0 image: null label: key_value_region +meta: null parent: null prov: [] references: [] diff --git a/test/data/docling_document/unit/ListItem.yaml b/test/data/docling_document/unit/ListItem.yaml index 20d8de90..300661d3 100644 --- a/test/data/docling_document/unit/ListItem.yaml +++ b/test/data/docling_document/unit/ListItem.yaml @@ -10,3 +10,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/PictureItem.yaml b/test/data/docling_document/unit/PictureItem.yaml index ffe342a6..3fc72158 100644 --- a/test/data/docling_document/unit/PictureItem.yaml +++ b/test/data/docling_document/unit/PictureItem.yaml @@ -8,4 +8,5 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +content_layer: body +meta: null diff --git a/test/data/docling_document/unit/SectionHeaderItem.yaml b/test/data/docling_document/unit/SectionHeaderItem.yaml index 68f641f9..1ab1a526 100644 --- a/test/data/docling_document/unit/SectionHeaderItem.yaml +++ b/test/data/docling_document/unit/SectionHeaderItem.yaml @@ -9,3 +9,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml index ae08e00e..b93a89bd 
100644 --- a/test/data/docling_document/unit/TableItem.yaml +++ b/test/data/docling_document/unit/TableItem.yaml @@ -194,3 +194,4 @@ references: [] self_ref: '#' content_layer: body annotations: [] +meta: null diff --git a/test/data/docling_document/unit/TextItem.yaml b/test/data/docling_document/unit/TextItem.yaml index 1f72637a..7061046a 100644 --- a/test/data/docling_document/unit/TextItem.yaml +++ b/test/data/docling_document/unit/TextItem.yaml @@ -8,3 +8,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/TitleItem.yaml b/test/data/docling_document/unit/TitleItem.yaml index 8e2a3dea..7fcbb4cc 100644 --- a/test/data/docling_document/unit/TitleItem.yaml +++ b/test/data/docling_document/unit/TitleItem.yaml @@ -8,3 +8,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/test_metadata.py b/test/test_metadata.py new file mode 100644 index 00000000..b9e5c594 --- /dev/null +++ b/test/test_metadata.py @@ -0,0 +1,19 @@ +from pathlib import Path + +from docling_core.types.doc.document import DoclingDocument, RefItem + +from .test_data_gen_flag import GEN_TEST_DATA + + +def test_metadata(): + src = Path("test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(filename=src) + example_item = RefItem(cref="#/texts/2").resolve(doc=doc) + example_item.meta.example_custom_field_added_programmaticaly = True + + exp_file = src.parent / f"{src.stem}_modified.yaml" + if GEN_TEST_DATA: + doc.save_as_yaml(filename=exp_file) + else: + expected = DoclingDocument.load_from_yaml(filename=exp_file) + assert doc == expected From 83b19489d995a5d9e815594cfc896901c2a54afa Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 21 Oct 2025 16:30:52 +0200 Subject: [PATCH 02/22] add deprecation, add first migration Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 105 ++++++--- docs/DoclingDocument.json | 206 ++++++++++-------- examples/metadata.ipynb | 101 ++++++--- test/data/doc/dummy_doc_2_prec.yaml | 6 + test/data/doc/dummy_doc_with_meta.yaml | 19 +- .../doc/dummy_doc_with_meta_modified.yaml | 18 +- 6 files changed, 280 insertions(+), 175 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 2a466a52..c1a56778 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -27,6 +27,8 @@ Field, FieldSerializationInfo, StringConstraints, + TypeAdapter, + ValidationError, computed_field, field_serializer, field_validator, @@ -941,39 +943,51 @@ class ContentLayer(str, Enum): DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY} -class BaseMeta(BaseModel): - """Base class for metadata.""" +class BasePrediction(BaseModel): + """Prediction field.""" - model_config = ConfigDict(extra="allow") + confidence: Optional[float] = None + provenance: Optional[str] = None + details: Optional[dict[str, Any]] = None + @field_serializer("confidence") + def _serialize(self, value: float, info: FieldSerializationInfo) -> float: + return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC) -class SummaryInstance(BaseModel): - """Single summary data point.""" + +class SummaryMetaField(BasePrediction): + """Summary data.""" text: str - confidence: Optional[float] = None - provenance: Optional[str] = None -class SummaryModel(BaseModel): - """Summary data.""" +class BaseMeta(BaseModel): + """Base class for metadata.""" + + model_config = ConfigDict(extra="allow") + summary: 
Optional[SummaryMetaField] = None + - # convention: the first instance represents the main summary - instances: List[SummaryInstance] = Field(default_factory=list, min_length=1) - # NOTE: if needed, can add validator to coerce simpler forms to instances +class PictureClassificationPrediction(BasePrediction): + """Picture classification instance.""" + class_name: str -class CommonMeta(BaseMeta): - """Common metadata model.""" - summary: Optional[SummaryModel] = None +class PictureClassificationMetaField(BaseModel): + """Picture classification metadata field.""" + + predictions: list[PictureClassificationPrediction] = Field( + default_factory=list, min_length=1 + ) -class PictureMeta(CommonMeta): +class PictureMeta(BaseMeta): """Picture metadata model.""" + classification: Optional[PictureClassificationMetaField] = None + # TODO the previous classes include "kind" for disambiguation, which is not needed here - classification: Optional[PictureClassificationData] = None molecule: Optional[PictureMoleculeData] = None tabular_chart: Optional[PictureTabularChartData] = None line_chart: Optional[PictureLineChartData] = None @@ -983,13 +997,6 @@ class PictureMeta(CommonMeta): scatter_chart: Optional[PictureScatterChartData] = None -class TableMeta(CommonMeta): - """Table metadata model.""" - - # TODO the previous classes include "kind" for disambiguation, which is not needed here - description: Optional[DescriptionAnnotation] = None - - class NodeItem(BaseModel): """NodeItem.""" @@ -1099,7 +1106,7 @@ def _add_sibling( class GroupItem(NodeItem): # Container type, can't be a leaf node """GroupItem.""" - meta: Optional[CommonMeta] = None + meta: Optional[BaseMeta] = None name: str = ( "group" # Name of the group, e.g. "Introduction Chapter", @@ -1151,7 +1158,7 @@ class DocItem( label: DocItemLabel prov: List[ProvenanceItem] = [] - meta: Optional[CommonMeta] = None + meta: Optional[BaseMeta] = None def get_location_tokens( self, @@ -1460,9 +1467,47 @@ class PictureItem(FloatingItem): DocItemLabel.PICTURE ) - annotations: List[PictureDataType] = [] + annotations: Annotated[ + List[PictureDataType], + Field(deprecated="The `annotations` field is deprecated; use `meta` instead."), + ] = [] meta: Optional[PictureMeta] = None + @model_validator(mode="before") + @classmethod + def migrate_annotations_to_meta(cls, data: Any) -> Any: + """Migrate the `annotations` field to `meta`.""" + if isinstance(data, dict) and (annotations := data.get("annotations")): + + for raw_ann in annotations: + # migrate annotations to meta + try: + # Use Pydantic TypeAdapter to validate the annotation type according to the instruction. 
+ + ann: PictureDataType = TypeAdapter(PictureDataType).validate_python( + raw_ann + ) + if isinstance(ann, PictureClassificationData): + # ensure meta field is present + data.setdefault("meta", {}) + data["meta"].setdefault( + "classification", + PictureClassificationMetaField( + predictions=[ + PictureClassificationPrediction( + class_name=pred.class_name, + confidence=pred.confidence, + provenance=ann.provenance, + ) + for pred in ann.predicted_classes + ], + ).model_dump(), + ) + except ValidationError as e: + raise e + + return data + # Convert the image to Base64 def _image_to_base64(self, pil_image, format="PNG"): """Base64 representation of the image.""" @@ -1609,8 +1654,10 @@ class TableItem(FloatingItem): DocItemLabel.TABLE, ] = DocItemLabel.TABLE - annotations: List[TableAnnotationType] = [] - meta: Optional[TableMeta] = None + annotations: Annotated[ + List[TableAnnotationType], + deprecated("The `annotations` field is deprecated; use `meta` instead."), + ] = [] def export_to_dataframe( self, doc: Optional["DoclingDocument"] = None diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 22e468f9..1b431827 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1,5 +1,24 @@ { "$defs": { + "BaseMeta": { + "additionalProperties": true, + "description": "Base class for metadata.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryMetaField" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "BaseMeta", + "type": "object" + }, "BoundingBox": { "description": "BoundingBox.", "properties": { @@ -197,7 +216,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -368,25 +387,6 @@ "title": "CodeLanguageLabel", "type": "string" }, - "CommonMeta": { - "additionalProperties": true, - "description": "Common metadata model.", - "properties": { - "summary": { - "anyOf": [ - { - "$ref": "#/$defs/SummaryModel" - }, - { - "type": "null" - } - ], - "default": null - } - }, - "title": "CommonMeta", - "type": "object" - }, "ContentLayer": { "description": "ContentLayer.", "enum": [ @@ -508,7 +508,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -642,7 +642,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -862,7 +862,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -978,7 +978,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -1039,7 +1039,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -1142,7 +1142,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -1203,7 +1203,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -1416,6 +1416,72 @@ "title": "PictureClassificationData", "type": "object" }, + "PictureClassificationMetaField": { + "description": "Picture classification metadata field.", + "properties": { + "predictions": { + "items": { + "$ref": "#/$defs/PictureClassificationPrediction" + }, + "minItems": 1, + "title": "Predictions", + "type": "array" + } + }, + "title": "PictureClassificationMetaField", + "type": "object" + }, + "PictureClassificationPrediction": { + "description": "Picture classification instance.", + "properties": { + 
"confidence": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Confidence" + }, + "provenance": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Provenance" + }, + "details": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Details" + }, + "class_name": { + "title": "Class Name", + "type": "string" + } + }, + "required": [ + "class_name" + ], + "title": "PictureClassificationPrediction", + "type": "object" + }, "PictureItem": { "additionalProperties": false, "description": "PictureItem.", @@ -1513,6 +1579,7 @@ }, "annotations": { "default": [], + "deprecated": true, "items": { "discriminator": { "mapping": { @@ -1617,7 +1684,7 @@ "summary": { "anyOf": [ { - "$ref": "#/$defs/SummaryModel" + "$ref": "#/$defs/SummaryMetaField" }, { "type": "null" @@ -1628,7 +1695,7 @@ "classification": { "anyOf": [ { - "$ref": "#/$defs/PictureClassificationData" + "$ref": "#/$defs/PictureClassificationMetaField" }, { "type": "null" @@ -2070,7 +2137,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -2162,13 +2229,9 @@ "title": "Size", "type": "object" }, - "SummaryInstance": { - "description": "Single summary data point.", + "SummaryMetaField": { + "description": "Summary data.", "properties": { - "text": { - "title": "Text", - "type": "string" - }, "confidence": { "anyOf": [ { @@ -2192,27 +2255,29 @@ ], "default": null, "title": "Provenance" + }, + "details": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Details" + }, + "text": { + "title": "Text", + "type": "string" } }, "required": [ "text" ], - "title": "SummaryInstance", - "type": "object" - }, - "SummaryModel": { - "description": "Summary data.", - "properties": { - "instances": { - "items": { - "$ref": "#/$defs/SummaryInstance" - }, - "minItems": 1, - "title": "Instances", - "type": "array" - } - }, - "title": "SummaryModel", + "title": "SummaryMetaField", "type": "object" }, "TableCell": { @@ -2357,7 +2422,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/TableMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -2422,6 +2487,7 @@ }, "annotations": { "default": [], + "deprecated": true, "items": { "discriminator": { "mapping": { @@ -2450,36 +2516,6 @@ "title": "TableItem", "type": "object" }, - "TableMeta": { - "additionalProperties": true, - "description": "Table metadata model.", - "properties": { - "summary": { - "anyOf": [ - { - "$ref": "#/$defs/SummaryModel" - }, - { - "type": "null" - } - ], - "default": null - }, - "description": { - "anyOf": [ - { - "$ref": "#/$defs/DescriptionAnnotation" - }, - { - "type": "null" - } - ], - "default": null - } - }, - "title": "TableMeta", - "type": "object" - }, "TextItem": { "additionalProperties": false, "description": "TextItem.", @@ -2515,7 +2551,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" @@ -2629,7 +2665,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/CommonMeta" + "$ref": "#/$defs/BaseMeta" }, { "type": "null" diff --git a/examples/metadata.ipynb b/examples/metadata.ipynb index bb70a469..31941f41 100644 --- a/examples/metadata.ipynb +++ b/examples/metadata.ipynb @@ -48,25 +48,15 @@ { "data": { "text/html": [ - "
CommonMeta(\n",
-       "    summary=SummaryModel(\n",
-       "        instances=[\n",
-       "            SummaryInstance(text='This is a title.', confidence=0.95, provenance='model1'),\n",
-       "            SummaryInstance(text='This is a figure.', confidence=0.42, provenance='model2')\n",
-       "        ]\n",
-       "    ),\n",
+       "
BaseMeta(\n",
+       "    summary=SummaryMetaField(confidence=0.95, provenance='model1', details=None, text='This is a title.'),\n",
        "    example_custom_field_1='More stuff here.'\n",
        ")\n",
        "
\n" ], "text/plain": [ - "\u001b[1;35mCommonMeta\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryModel\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33minstances\u001b[0m=\u001b[1m[\u001b[0m\n", - " \u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a title.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.95\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a figure.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.42\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model2'\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", + "\u001b[1;35mBaseMeta\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.95\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m, \u001b[33mdetails\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'This is a title.'\u001b[0m\u001b[1m)\u001b[0m,\n", " \u001b[33mexample_custom_field_1\u001b[0m=\u001b[32m'More stuff here.'\u001b[0m\n", "\u001b[1m)\u001b[0m\n" ] @@ -131,11 +121,7 @@ "text/html": [ "
PictureMeta(\n",
        "    summary=None,\n",
-       "    classification=PictureClassificationData(\n",
-       "        kind='classification',\n",
-       "        provenance='model1',\n",
-       "        predicted_classes=[PictureClassificationClass(class_name='bar_chart', confidence=0.78)]\n",
-       "    ),\n",
+       "    classification=PictureClassificationMetaField(predictions=[]),\n",
        "    molecule=PictureMoleculeData(\n",
        "        kind='molecule_data',\n",
        "        smi='CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1',\n",
@@ -160,11 +146,7 @@
       "text/plain": [
        "\u001b[1;35mPictureMeta\u001b[0m\u001b[1m(\u001b[0m\n",
        "    \u001b[33msummary\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
-       "    \u001b[33mclassification\u001b[0m=\u001b[1;35mPictureClassificationData\u001b[0m\u001b[1m(\u001b[0m\n",
-       "        \u001b[33mkind\u001b[0m=\u001b[32m'classification'\u001b[0m,\n",
-       "        \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m,\n",
-       "        \u001b[33mpredicted_classes\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mPictureClassificationClass\u001b[0m\u001b[1m(\u001b[0m\u001b[33mclass_name\u001b[0m=\u001b[32m'bar_chart'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.78\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n",
-       "    \u001b[1m)\u001b[0m,\n",
+       "    \u001b[33mclassification\u001b[0m=\u001b[1;35mPictureClassificationMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mpredictions\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n",
        "    \u001b[33mmolecule\u001b[0m=\u001b[1;35mPictureMoleculeData\u001b[0m\u001b[1m(\u001b[0m\n",
        "        \u001b[33mkind\u001b[0m=\u001b[32m'molecule_data'\u001b[0m,\n",
        "        \u001b[33msmi\u001b[0m=\u001b[32m'\u001b[0m\u001b[32mCC1\u001b[0m\u001b[32m=\u001b[0m\u001b[32mNNC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mC2\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN3C\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN\u001b[0m\u001b[32m=\u001b[0m\u001b[32mC3C\u001b[0m\u001b[32m(\u001b[0m\u001b[32mCC3\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=CC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=C3\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N1'\u001b[0m,\n",
@@ -225,6 +207,59 @@
      "metadata": {},
      "output_type": "display_data"
     },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/ipykernel_54834/2789842186.py:27: DeprecationWarning: The `annotations` field is deprecated; use `meta` instead.\n",
+      "  console.print(item.annotations)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
[\n",
+       "    PictureClassificationData(\n",
+       "        kind='classification',\n",
+       "        provenance='model1',\n",
+       "        predicted_classes=[PictureClassificationClass(class_name='bar_chart', confidence=0.78)]\n",
+       "    ),\n",
+       "    DescriptionAnnotation(kind='description', text='...', provenance='model2'),\n",
+       "    PictureMoleculeData(\n",
+       "        kind='molecule_data',\n",
+       "        smi='CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1',\n",
+       "        confidence=0.98,\n",
+       "        class_name='chemistry_molecular_structure',\n",
+       "        segmentation=[(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)],\n",
+       "        provenance='model3-1.0.0'\n",
+       "    ),\n",
+       "    MiscAnnotation(kind='misc', content={'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}})\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + " \u001b[1;35mPictureClassificationData\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mkind\u001b[0m=\u001b[32m'classification'\u001b[0m,\n", + " \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m,\n", + " \u001b[33mpredicted_classes\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mPictureClassificationClass\u001b[0m\u001b[1m(\u001b[0m\u001b[33mclass_name\u001b[0m=\u001b[32m'bar_chart'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.78\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mDescriptionAnnotation\u001b[0m\u001b[1m(\u001b[0m\u001b[33mkind\u001b[0m=\u001b[32m'description'\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'...'\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model2'\u001b[0m\u001b[1m)\u001b[0m,\n", + " \u001b[1;35mPictureMoleculeData\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33mkind\u001b[0m=\u001b[32m'molecule_data'\u001b[0m,\n", + " \u001b[33msmi\u001b[0m=\u001b[32m'\u001b[0m\u001b[32mCC1\u001b[0m\u001b[32m=\u001b[0m\u001b[32mNNC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mC2\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN3C\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN\u001b[0m\u001b[32m=\u001b[0m\u001b[32mC3C\u001b[0m\u001b[32m(\u001b[0m\u001b[32mCC3\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=CC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=C3\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N1'\u001b[0m,\n", + " \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.98\u001b[0m,\n", + " \u001b[33mclass_name\u001b[0m=\u001b[32m'chemistry_molecular_structure'\u001b[0m,\n", + " \u001b[33msegmentation\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[33mprovenance\u001b[0m=\u001b[32m'model3-1.0.0'\u001b[0m\n", + " \u001b[1m)\u001b[0m,\n", + " \u001b[1;35mMiscAnnotation\u001b[0m\u001b[1m(\u001b[0m\u001b[33mkind\u001b[0m=\u001b[32m'misc'\u001b[0m, \u001b[33mcontent\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -254,19 +289,15 @@ { "data": { "text/html": [ - "
CommonMeta(\n",
-       "    summary=SummaryModel(\n",
-       "        instances=[SummaryInstance(text='This is a section header.', confidence=None, provenance=None)]\n",
-       "    ),\n",
+       "
BaseMeta(\n",
+       "    summary=SummaryMetaField(confidence=None, provenance=None, details=None, text='This is a section header.'),\n",
        "    example_custom_field_added_programmaticaly=True\n",
        ")\n",
        "
\n" ], "text/plain": [ - "\u001b[1;35mCommonMeta\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryModel\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33minstances\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mSummaryInstance\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'This is a section header.'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[3;35mNone\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", + "\u001b[1;35mBaseMeta\u001b[0m\u001b[1m(\u001b[0m\n", + " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mconfidence\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mdetails\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'This is a section header.'\u001b[0m\u001b[1m)\u001b[0m,\n", " \u001b[33mexample_custom_field_added_programmaticaly\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", "\u001b[1m)\u001b[0m\n" ] @@ -329,7 +360,7 @@ ], "source": [ "from pathlib import Path\n", - "from docling_core.types.doc.document import ContentLayer, DoclingDocument\n", + "from docling_core.types.doc.document import ContentLayer, DoclingDocument, PictureItem\n", "from rich.console import Console\n", "import os\n", "\n", @@ -352,7 +383,9 @@ " console.print(\"item.meta:\")\n", " console.print(item.meta)\n", " console.print(\"item.meta.__pydantic_extra__:\")\n", - " console.print(item.meta.__pydantic_extra__)\n" + " console.print(item.meta.__pydantic_extra__)\n", + " if isinstance(item, PictureItem):\n", + " console.print(item.annotations)\n" ] }, { diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 60cca33f..d7732ad6 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -80,6 +80,12 @@ pictures: width: 231.0 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: picture + meta: + classification: + predictions: + - class_name: bar_chart + confidence: 0.8 + provenance: model1 parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml index 7ce87b77..3e47cea8 100644 --- a/test/data/doc/dummy_doc_with_meta.yaml +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -81,12 +81,6 @@ pictures: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: picture meta: - classification: - kind: classification - predicted_classes: - - class_name: bar_chart - confidence: 0.78 - provenance: model1 example_custom_description_field: provenance: model2 text: '...' @@ -211,13 +205,9 @@ texts: meta: example_custom_field_1: More stuff here. summary: - instances: - - confidence: 0.95 - provenance: model1 - text: This is a title. - - confidence: 0.42 - provenance: model2 - text: This is a figure. + confidence: 0.95 + provenance: model1 + text: This is a title. orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' @@ -242,8 +232,7 @@ texts: level: 1 meta: summary: - instances: - - text: This is a section header. + text: This is a section header. orig: OPERATION (cont.) 
parent: $ref: '#/pictures/0' diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index ff5887cd..866047e4 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -82,11 +82,10 @@ pictures: label: picture meta: classification: - kind: classification - predicted_classes: + predictions: - class_name: bar_chart confidence: 0.78 - provenance: model1 + provenance: model1 example_custom_description_field: provenance: model2 text: '...' @@ -211,13 +210,9 @@ texts: meta: example_custom_field_1: More stuff here. summary: - instances: - - confidence: 0.95 - provenance: model1 - text: This is a title. - - confidence: 0.42 - provenance: model2 - text: This is a figure. + confidence: 0.95 + provenance: model1 + text: This is a title. orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' @@ -243,8 +238,7 @@ texts: meta: example_custom_field_added_programmaticaly: true summary: - instances: - - text: This is a section header. + text: This is a section header. orig: OPERATION (cont.) parent: $ref: '#/pictures/0' From 33e2f6824a58d789f21a35f23dd403714c37fbdc Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 21 Oct 2025 22:50:34 +0200 Subject: [PATCH 03/22] extend annotations migration Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 87 ++-- docs/DoclingDocument.json | 29 +- examples/metadata.ipynb | 421 ------------------ test/data/chunker/0_out_chunks.json | 78 ++++ test/data/chunker/0b_out_chunks.json | 78 ++++ .../2408.09869v3_enriched_p2_p3_p5.gt.json | 6 + test/data/doc/dummy_doc_2_prec.yaml | 25 ++ test/data/doc/dummy_doc_with_meta.yaml | 27 +- .../doc/dummy_doc_with_meta_modified.yaml | 13 +- 9 files changed, 259 insertions(+), 505 deletions(-) delete mode 100644 examples/metadata.ipynb diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index c1a56778..102c89c0 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -943,12 +943,27 @@ class ContentLayer(str, Enum): DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY} -class BasePrediction(BaseModel): +class ExtraAllowingModel(BaseModel): + """Base model allowing extra fields.""" + + model_config = ConfigDict(extra="allow") + + def _get_extra_dict(self) -> dict[str, Any]: + """Get the extra fields as a dictionary.""" + return self.__pydantic_extra__ or {} + + def _copy_without_extra(self) -> Self: + """Create a copy without the extra fields.""" + return self.model_validate( + self.model_dump(exclude={ex for ex in self._get_extra_dict()}) + ) + + +class BasePrediction(ExtraAllowingModel): """Prediction field.""" confidence: Optional[float] = None provenance: Optional[str] = None - details: Optional[dict[str, Any]] = None @field_serializer("confidence") def _serialize(self, value: float, info: FieldSerializationInfo) -> float: @@ -961,10 +976,9 @@ class SummaryMetaField(BasePrediction): text: str -class BaseMeta(BaseModel): +class BaseMeta(ExtraAllowingModel): """Base class for metadata.""" - model_config = ConfigDict(extra="allow") summary: Optional[SummaryMetaField] = None @@ -974,7 +988,7 @@ class PictureClassificationPrediction(BasePrediction): class_name: str -class PictureClassificationMetaField(BaseModel): +class PictureClassificationMetaField(ExtraAllowingModel): """Picture classification metadata field.""" predictions: list[PictureClassificationPrediction] = Field( @@ -1469,7 +1483,7 @@ class 
PictureItem(FloatingItem): annotations: Annotated[ List[PictureDataType], - Field(deprecated="The `annotations` field is deprecated; use `meta` instead."), + deprecated("Field `annotations` is deprecated; use `meta` instead."), ] = [] meta: Optional[PictureMeta] = None @@ -1478,34 +1492,55 @@ class PictureItem(FloatingItem): def migrate_annotations_to_meta(cls, data: Any) -> Any: """Migrate the `annotations` field to `meta`.""" if isinstance(data, dict) and (annotations := data.get("annotations")): - + _logger.warning( + "Migrating deprecated `annotations` to `meta`; this will be removed in the future. " + "Note that only the first available instance of each annotation type will be migrated." + ) for raw_ann in annotations: # migrate annotations to meta - try: - # Use Pydantic TypeAdapter to validate the annotation type according to the instruction. + try: ann: PictureDataType = TypeAdapter(PictureDataType).validate_python( raw_ann ) - if isinstance(ann, PictureClassificationData): - # ensure meta field is present - data.setdefault("meta", {}) - data["meta"].setdefault( - "classification", - PictureClassificationMetaField( - predictions=[ - PictureClassificationPrediction( - class_name=pred.class_name, - confidence=pred.confidence, - provenance=ann.provenance, - ) - for pred in ann.predicted_classes - ], - ).model_dump(), - ) except ValidationError as e: raise e + # ensure meta field is present + data.setdefault("meta", {}) + + if isinstance(ann, PictureClassificationData): + data["meta"].setdefault( + "classification", + PictureClassificationMetaField( + predictions=[ + PictureClassificationPrediction( + class_name=pred.class_name, + confidence=pred.confidence, + provenance=ann.provenance, + ) + for pred in ann.predicted_classes + ], + ).model_dump(mode="json"), + ) + # migrate description annotation to summary meta field + elif isinstance(ann, DescriptionAnnotation): + data["meta"].setdefault( + "summary", + SummaryMetaField( + text=ann.text, + provenance=ann.provenance, + ).model_dump(mode="json"), + ) + # TODO add other relevant annotation types... 
+ else: + # fall back to reusing (namespaced) original annotation type name + data["meta"].setdefault( + f"docling_internal_{ann.kind}", + ann.model_dump(mode="json"), + ) + # TODO: add other annotation types to meta + return data # Convert the image to Base64 @@ -1656,7 +1691,7 @@ class TableItem(FloatingItem): annotations: Annotated[ List[TableAnnotationType], - deprecated("The `annotations` field is deprecated; use `meta` instead."), + deprecated("Field `annotations` is deprecated; use `meta` instead."), ] = [] def export_to_dataframe( diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 1b431827..e3769f21 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1417,6 +1417,7 @@ "type": "object" }, "PictureClassificationMetaField": { + "additionalProperties": true, "description": "Picture classification metadata field.", "properties": { "predictions": { @@ -1432,6 +1433,7 @@ "type": "object" }, "PictureClassificationPrediction": { + "additionalProperties": true, "description": "Picture classification instance.", "properties": { "confidence": { @@ -1458,19 +1460,6 @@ "default": null, "title": "Provenance" }, - "details": { - "anyOf": [ - { - "additionalProperties": true, - "type": "object" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Details" - }, "class_name": { "title": "Class Name", "type": "string" @@ -2230,6 +2219,7 @@ "type": "object" }, "SummaryMetaField": { + "additionalProperties": true, "description": "Summary data.", "properties": { "confidence": { @@ -2256,19 +2246,6 @@ "default": null, "title": "Provenance" }, - "details": { - "anyOf": [ - { - "additionalProperties": true, - "type": "object" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Details" - }, "text": { "title": "Text", "type": "string" diff --git a/examples/metadata.ipynb b/examples/metadata.ipynb deleted file mode 100644 index 31941f41..00000000 --- a/examples/metadata.ipynb +++ /dev/null @@ -1,421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e638ac23", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
───────────────────────────────────────────────────── #/body ──────────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m───────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/\u001b[0m\u001b[1;95mbody\u001b[0m\u001b[92m ──────────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
──────────────────────────────────────────────────── #/texts/1 ────────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m1\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
BaseMeta(\n",
-       "    summary=SummaryMetaField(confidence=0.95, provenance='model1', details=None, text='This is a title.'),\n",
-       "    example_custom_field_1='More stuff here.'\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mBaseMeta\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.95\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m, \u001b[33mdetails\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'This is a title.'\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[33mexample_custom_field_1\u001b[0m=\u001b[32m'More stuff here.'\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta.__pydantic_extra__:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta.__pydantic_extra__:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{'example_custom_field_1': 'More stuff here.'}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'example_custom_field_1'\u001b[0m: \u001b[32m'More stuff here.'\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
────────────────────────────────────────────────── #/pictures/0 ───────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/pictures/\u001b[0m\u001b[1;95m0\u001b[0m\u001b[92m ───────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
PictureMeta(\n",
-       "    summary=None,\n",
-       "    classification=PictureClassificationMetaField(predictions=[]),\n",
-       "    molecule=PictureMoleculeData(\n",
-       "        kind='molecule_data',\n",
-       "        smi='CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1',\n",
-       "        confidence=0.98,\n",
-       "        class_name='chemistry_molecular_structure',\n",
-       "        segmentation=[(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)],\n",
-       "        provenance='model3-1.0.0'\n",
-       "    ),\n",
-       "    tabular_chart=None,\n",
-       "    line_chart=None,\n",
-       "    bar_chart=None,\n",
-       "    stacked_bar_chart=None,\n",
-       "    pie_chart=None,\n",
-       "    scatter_chart=None,\n",
-       "    example_custom_description_field={'provenance': 'model2', 'text': '...'},\n",
-       "    example_custom_misc_field={\n",
-       "        'content': {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}\n",
-       "    }\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mPictureMeta\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msummary\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mclassification\u001b[0m=\u001b[1;35mPictureClassificationMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mpredictions\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[33mmolecule\u001b[0m=\u001b[1;35mPictureMoleculeData\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mkind\u001b[0m=\u001b[32m'molecule_data'\u001b[0m,\n", - " \u001b[33msmi\u001b[0m=\u001b[32m'\u001b[0m\u001b[32mCC1\u001b[0m\u001b[32m=\u001b[0m\u001b[32mNNC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mC2\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN3C\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN\u001b[0m\u001b[32m=\u001b[0m\u001b[32mC3C\u001b[0m\u001b[32m(\u001b[0m\u001b[32mCC3\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=CC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=C3\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N1'\u001b[0m,\n", - " \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.98\u001b[0m,\n", - " \u001b[33mclass_name\u001b[0m=\u001b[32m'chemistry_molecular_structure'\u001b[0m,\n", - " \u001b[33msegmentation\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mprovenance\u001b[0m=\u001b[32m'model3-1.0.0'\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[33mtabular_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mline_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mbar_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mstacked_bar_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mpie_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mscatter_chart\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - " \u001b[33mexample_custom_description_field\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'provenance'\u001b[0m: \u001b[32m'model2'\u001b[0m, \u001b[32m'text'\u001b[0m: \u001b[32m'...'\u001b[0m\u001b[1m}\u001b[0m,\n", - " \u001b[33mexample_custom_misc_field\u001b[0m=\u001b[1m{\u001b[0m\n", - " \u001b[32m'content'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta.__pydantic_extra__:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta.__pydantic_extra__:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{\n",
-       "    'example_custom_description_field': {'provenance': 'model2', 'text': '...'},\n",
-       "    'example_custom_misc_field': {\n",
-       "        'content': {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}\n",
-       "    }\n",
-       "}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\n", - " \u001b[32m'example_custom_description_field'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'provenance'\u001b[0m: \u001b[32m'model2'\u001b[0m, \u001b[32m'text'\u001b[0m: \u001b[32m'...'\u001b[0m\u001b[1m}\u001b[0m,\n", - " \u001b[32m'example_custom_misc_field'\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[32m'content'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/76/4wwfs06x6835kcwj4186c0nc0000gn/T/ipykernel_54834/2789842186.py:27: DeprecationWarning: The `annotations` field is deprecated; use `meta` instead.\n", - " console.print(item.annotations)\n" - ] - }, - { - "data": { - "text/html": [ - "
[\n",
-       "    PictureClassificationData(\n",
-       "        kind='classification',\n",
-       "        provenance='model1',\n",
-       "        predicted_classes=[PictureClassificationClass(class_name='bar_chart', confidence=0.78)]\n",
-       "    ),\n",
-       "    DescriptionAnnotation(kind='description', text='...', provenance='model2'),\n",
-       "    PictureMoleculeData(\n",
-       "        kind='molecule_data',\n",
-       "        smi='CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1',\n",
-       "        confidence=0.98,\n",
-       "        class_name='chemistry_molecular_structure',\n",
-       "        segmentation=[(0.0, 0.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)],\n",
-       "        provenance='model3-1.0.0'\n",
-       "    ),\n",
-       "    MiscAnnotation(kind='misc', content={'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}})\n",
-       "]\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[1;35mPictureClassificationData\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mkind\u001b[0m=\u001b[32m'classification'\u001b[0m,\n", - " \u001b[33mprovenance\u001b[0m=\u001b[32m'model1'\u001b[0m,\n", - " \u001b[33mpredicted_classes\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mPictureClassificationClass\u001b[0m\u001b[1m(\u001b[0m\u001b[33mclass_name\u001b[0m=\u001b[32m'bar_chart'\u001b[0m, \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.78\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mDescriptionAnnotation\u001b[0m\u001b[1m(\u001b[0m\u001b[33mkind\u001b[0m=\u001b[32m'description'\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'...'\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[32m'model2'\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[1;35mPictureMoleculeData\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33mkind\u001b[0m=\u001b[32m'molecule_data'\u001b[0m,\n", - " \u001b[33msmi\u001b[0m=\u001b[32m'\u001b[0m\u001b[32mCC1\u001b[0m\u001b[32m=\u001b[0m\u001b[32mNNC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mC2\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN3C\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCN\u001b[0m\u001b[32m=\u001b[0m\u001b[32mC3C\u001b[0m\u001b[32m(\u001b[0m\u001b[32mCC3\u001b[0m\u001b[32m=\u001b[0m\u001b[32mCC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=CC\u001b[0m\u001b[32m(\u001b[0m\u001b[32mF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=C3\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m=N1'\u001b[0m,\n", - " \u001b[33mconfidence\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.98\u001b[0m,\n", - " \u001b[33mclass_name\u001b[0m=\u001b[32m'chemistry_molecular_structure'\u001b[0m,\n", - " \u001b[33msegmentation\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m0.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m0.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m, \u001b[1m(\u001b[0m\u001b[1;36m1.0\u001b[0m, \u001b[1;36m1.0\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", - " \u001b[33mprovenance\u001b[0m=\u001b[32m'model3-1.0.0'\u001b[0m\n", - " \u001b[1m)\u001b[0m,\n", - " \u001b[1;35mMiscAnnotation\u001b[0m\u001b[1m(\u001b[0m\u001b[33mkind\u001b[0m=\u001b[32m'misc'\u001b[0m, \u001b[33mcontent\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'myanalysis'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'prediction'\u001b[0m: \u001b[32m'abc'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'something_else'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'aaa'\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
──────────────────────────────────────────────────── #/texts/2 ────────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m2\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
BaseMeta(\n",
-       "    summary=SummaryMetaField(confidence=None, provenance=None, details=None, text='This is a section header.'),\n",
-       "    example_custom_field_added_programmaticaly=True\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;35mBaseMeta\u001b[0m\u001b[1m(\u001b[0m\n", - " \u001b[33msummary\u001b[0m=\u001b[1;35mSummaryMetaField\u001b[0m\u001b[1m(\u001b[0m\u001b[33mconfidence\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mprovenance\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mdetails\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mtext\u001b[0m=\u001b[32m'This is a section header.'\u001b[0m\u001b[1m)\u001b[0m,\n", - " \u001b[33mexample_custom_field_added_programmaticaly\u001b[0m=\u001b[3;92mTrue\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
item.meta.__pydantic_extra__:\n",
-       "
\n" - ], - "text/plain": [ - "item.meta.__pydantic_extra__:\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{'example_custom_field_added_programmaticaly': True}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'example_custom_field_added_programmaticaly'\u001b[0m: \u001b[3;92mTrue\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
──────────────────────────────────────────────────── #/texts/3 ────────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m──────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/texts/\u001b[0m\u001b[1;95m3\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
─────────────────────────────────────────────────── #/tables/0 ────────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[92m─────────────────────────────────────────────────── \u001b[0m\u001b[1m#\u001b[0m\u001b[1;35m/tables/\u001b[0m\u001b[1;95m0\u001b[0m\u001b[92m ────────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from pathlib import Path\n", - "from docling_core.types.doc.document import ContentLayer, DoclingDocument, PictureItem\n", - "from rich.console import Console\n", - "import os\n", - "\n", - "source_path = Path(\"../test/data/doc/dummy_doc_with_meta.yaml\")\n", - "\n", - "doc = DoclingDocument.load_from_yaml(filename=source_path)\n", - "# doc._validate_rules()\n", - "\n", - "console = Console()\n", - "\n", - "for item, i in doc.iterate_items(\n", - " included_content_layers={cl for cl in ContentLayer}, with_groups=True, traverse_pictures=True):\n", - " console.rule(f\"[bold]{item.self_ref}\")\n", - " if item.meta:\n", - "\n", - " # showcasing adding a new field programmatically\n", - " if item.self_ref == \"#/texts/2\":\n", - " item.meta.example_custom_field_added_programmaticaly = True\n", - "\n", - " console.print(\"item.meta:\")\n", - " console.print(item.meta)\n", - " console.print(\"item.meta.__pydantic_extra__:\")\n", - " console.print(item.meta.__pydantic_extra__)\n", - " if isinstance(item, PictureItem):\n", - " console.print(item.annotations)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c65273e7", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index 5eb6ff4c..915a2ce7 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -13,6 +13,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a cartoon image of a duck holding a paper." + } + }, "label": "picture", "prov": [ { @@ -938,6 +944,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." + } + }, "label": "picture", "prov": [ { @@ -3791,6 +3803,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." + } + }, "label": "picture", "prov": [ { @@ -4108,6 +4126,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a text." + } + }, "label": "picture", "prov": [ { @@ -4376,6 +4400,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see the text on the image." 
+ } + }, "label": "picture", "prov": [ { @@ -4606,6 +4636,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a paper with some text on it." + } + }, "label": "picture", "prov": [ { @@ -5079,6 +5115,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see a table." + } + }, "label": "picture", "prov": [ { @@ -5208,6 +5250,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" + } + }, "label": "picture", "prov": [ { @@ -5771,6 +5819,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." + } + }, "label": "picture", "prov": [ { @@ -6292,6 +6346,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." + } + }, "label": "picture", "prov": [ { @@ -6382,6 +6442,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see a blue circle." 
+ } + }, "label": "picture", "prov": [ { @@ -7193,6 +7259,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "A table with different columns and rows." + } + }, "label": "picture", "prov": [ { @@ -7879,6 +7951,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." + } + }, "label": "picture", "prov": [ { diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index a242c810..49ad9d4e 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -13,6 +13,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a cartoon image of a duck holding a paper." + } + }, "label": "picture", "prov": [ { @@ -938,6 +944,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." + } + }, "label": "picture", "prov": [ { @@ -3791,6 +3803,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." + } + }, "label": "picture", "prov": [ { @@ -4108,6 +4126,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a text." + } + }, "label": "picture", "prov": [ { @@ -4376,6 +4400,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see the text on the image." + } + }, "label": "picture", "prov": [ { @@ -4606,6 +4636,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a paper with some text on it." + } + }, "label": "picture", "prov": [ { @@ -5079,6 +5115,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see a table." + } + }, "label": "picture", "prov": [ { @@ -5208,6 +5250,12 @@ }, "children": [], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. 
The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" + } + }, "label": "picture", "prov": [ { @@ -5771,6 +5819,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." + } + }, "label": "picture", "prov": [ { @@ -6292,6 +6346,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." + } + }, "label": "picture", "prov": [ { @@ -6382,6 +6442,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see a blue circle." + } + }, "label": "picture", "prov": [ { @@ -7193,6 +7259,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "A table with different columns and rows." + } + }, "label": "picture", "prov": [ { @@ -7879,6 +7951,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." + } + }, "label": "picture", "prov": [ { diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7bbddf7b..b3bf77d9 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1901,6 +1901,12 @@ } ], "content_layer": "body", + "meta": { + "summary": { + "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." 
+ } + }, "label": "picture", "prov": [ { diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index d7732ad6..f555edde 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -86,6 +86,31 @@ pictures: - class_name: bar_chart confidence: 0.8 provenance: model1 + docling_internal_misc: + content: + myanalysis: + prediction: abc + something_else: + text: aaa + kind: misc + docling_internal_molecule_data: + class_name: chemistry_molecular_structure + confidence: 0.9876 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + summary: + provenance: model2 + text: '...' parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml index 3e47cea8..9a590549 100644 --- a/test/data/doc/dummy_doc_with_meta.yaml +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -80,31 +80,6 @@ pictures: width: 231.0 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: picture - meta: - example_custom_description_field: - provenance: model2 - text: '...' - example_custom_misc_field: - content: - myanalysis: - prediction: abc - something_else: - text: aaa - molecule: - class_name: chemistry_molecular_structure - confidence: 0.98 - kind: molecule_data - provenance: model3-1.0.0 - segmentation: - - - 0.0 - - 0.0 - - - 1.0 - - 0.0 - - - 0.0 - - 1.0 - - - 1.0 - - 1.0 - smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov: @@ -203,11 +178,11 @@ texts: content_layer: body label: title meta: - example_custom_field_1: More stuff here. summary: confidence: 0.95 provenance: model1 text: This is a title. + my_corp_custom_field_1: More stuff here. orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index 866047e4..030cf785 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -86,16 +86,14 @@ pictures: - class_name: bar_chart confidence: 0.78 provenance: model1 - example_custom_description_field: - provenance: model2 - text: '...' - example_custom_misc_field: + docling_internal_misc: content: myanalysis: prediction: abc something_else: text: aaa - molecule: + kind: misc + docling_internal_molecule_data: class_name: chemistry_molecular_structure confidence: 0.98 kind: molecule_data @@ -110,6 +108,9 @@ pictures: - - 1.0 - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + summary: + provenance: model2 + text: '...' parent: $ref: '#/body' prov: @@ -208,7 +209,7 @@ texts: content_layer: body label: title meta: - example_custom_field_1: More stuff here. + my_corp_custom_field_1: More stuff here. 
summary: confidence: 0.95 provenance: model1 From e7f278c3febfd793d9b98e4811675c98b73ff333 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Fri, 24 Oct 2025 13:03:46 +0200 Subject: [PATCH 04/22] update with feedback Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 155 +++++++++++--- docs/DoclingDocument.json | 194 ++++++++++-------- test/data/chunker/0_out_chunks.json | 52 ++--- test/data/chunker/0b_out_chunks.json | 52 ++--- .../2408.09869v3_enriched_p2_p3_p5.gt.json | 10 +- test/data/doc/dummy_doc_2_prec.yaml | 27 +-- .../doc/dummy_doc_with_meta_modified.yaml | 25 ++- 7 files changed, 333 insertions(+), 182 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 102c89c0..b493f503 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -943,7 +943,7 @@ class ContentLayer(str, Enum): DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY} -class ExtraAllowingModel(BaseModel): +class _ExtraAllowingModel(BaseModel): """Base model allowing extra fields.""" model_config = ConfigDict(extra="allow") @@ -959,11 +959,21 @@ def _copy_without_extra(self) -> Self: ) -class BasePrediction(ExtraAllowingModel): +class BasePrediction(_ExtraAllowingModel): """Prediction field.""" - confidence: Optional[float] = None - provenance: Optional[str] = None + confidence: Optional[float] = Field( + default=None, + ge=0, + le=1, + description="The confidence of the prediction.", + examples=[0.9, 0.42], + ) + # source: Optional[str] = Field( + # default=None, + # description="The origin of the prediction.", + # examples=["ibm-granite/granite-docling-258M"], + # ) @field_serializer("confidence") def _serialize(self, value: float, info: FieldSerializationInfo) -> float: @@ -976,7 +986,7 @@ class SummaryMetaField(BasePrediction): text: str -class BaseMeta(ExtraAllowingModel): +class BaseMeta(_ExtraAllowingModel): """Base class for metadata.""" summary: Optional[SummaryMetaField] = None @@ -988,7 +998,7 @@ class PictureClassificationPrediction(BasePrediction): class_name: str -class PictureClassificationMetaField(ExtraAllowingModel): +class PictureClassificationMetaField(_ExtraAllowingModel): """Picture classification metadata field.""" predictions: list[PictureClassificationPrediction] = Field( @@ -996,19 +1006,29 @@ class PictureClassificationMetaField(ExtraAllowingModel): ) +class MoleculeMetaField(BasePrediction): + """Molecule metadata field.""" + + # TODO: remove / rename / document / further specify fields? 
+ + smi: str + class_name: str + segmentation: List[Tuple[float, float]] + + +class TabularChartMetaField(BasePrediction): + """Tabular chart metadata field.""" + + title: str + chart_data: TableData + + class PictureMeta(BaseMeta): - """Picture metadata model.""" + """Metadata model for pictures.""" classification: Optional[PictureClassificationMetaField] = None - - # TODO the previous classes include "kind" for disambiguation, which is not needed here - molecule: Optional[PictureMoleculeData] = None - tabular_chart: Optional[PictureTabularChartData] = None - line_chart: Optional[PictureLineChartData] = None - bar_chart: Optional[PictureBarChartData] = None - stacked_bar_chart: Optional[PictureStackedBarChartData] = None - pie_chart: Optional[PicturePieChartData] = None - scatter_chart: Optional[PictureScatterChartData] = None + molecule: Optional[MoleculeMetaField] = None + tabular_chart: Optional[TabularChartMetaField] = None class NodeItem(BaseModel): @@ -1474,6 +1494,12 @@ class FormulaItem(TextItem): ) +def _create_internal_meta_field_name( + suffix: str, prefix: str = "docling_internal_" +) -> str: + return f"{prefix}{suffix}" + + class PictureItem(FloatingItem): """PictureItem.""" @@ -1481,11 +1507,11 @@ class PictureItem(FloatingItem): DocItemLabel.PICTURE ) + meta: Optional[PictureMeta] = None annotations: Annotated[ List[PictureDataType], deprecated("Field `annotations` is deprecated; use `meta` instead."), ] = [] - meta: Optional[PictureMeta] = None @model_validator(mode="before") @classmethod @@ -1517,7 +1543,11 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: PictureClassificationPrediction( class_name=pred.class_name, confidence=pred.confidence, - provenance=ann.provenance, + **{ + _create_internal_meta_field_name( + "provenance" + ): ann.provenance + }, ) for pred in ann.predicted_classes ], @@ -1529,17 +1559,47 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: "summary", SummaryMetaField( text=ann.text, - provenance=ann.provenance, + **{ + _create_internal_meta_field_name( + "provenance" + ): ann.provenance + }, + ).model_dump(mode="json"), + ) + elif isinstance(ann, PictureMoleculeData): + data["meta"].setdefault( + "molecule", + MoleculeMetaField( + smi=ann.smi, + class_name=ann.class_name, + segmentation=ann.segmentation, + confidence=ann.confidence, + **{ + _create_internal_meta_field_name( + "provenance" + ): ann.provenance + }, + ).model_dump(mode="json"), + ) + elif isinstance(ann, PictureTabularChartData): + data["meta"].setdefault( + "tabular_chart", + TabularChartMetaField( + title=ann.title, + chart_data=ann.chart_data, ).model_dump(mode="json"), ) - # TODO add other relevant annotation types... 
+ elif isinstance(ann, MiscAnnotation): + data["meta"].setdefault( + _create_internal_meta_field_name(ann.kind), + ann.content, + ) else: - # fall back to reusing (namespaced) original annotation type name + # fall back to reusing original annotation type name (in namespaced format) data["meta"].setdefault( - f"docling_internal_{ann.kind}", + _create_internal_meta_field_name(ann.kind), ann.model_dump(mode="json"), ) - # TODO: add other annotation types to meta return data @@ -1694,6 +1754,55 @@ class TableItem(FloatingItem): deprecated("Field `annotations` is deprecated; use `meta` instead."), ] = [] + @model_validator(mode="before") + @classmethod + def migrate_annotations_to_meta(cls, data: Any) -> Any: + """Migrate the `annotations` field to `meta`.""" + if isinstance(data, dict) and (annotations := data.get("annotations")): + _logger.warning( + "Migrating deprecated `annotations` to `meta`; this will be removed in the future. " + "Note that only the first available instance of each annotation type will be migrated." + ) + for raw_ann in annotations: + # migrate annotations to meta + + try: + ann: TableAnnotationType = TypeAdapter( + TableAnnotationType + ).validate_python(raw_ann) + except ValidationError as e: + raise e + + # ensure meta field is present + data.setdefault("meta", {}) + + # migrate description annotation to summary meta field + if isinstance(ann, DescriptionAnnotation): + data["meta"].setdefault( + "summary", + SummaryMetaField( + text=ann.text, + **{ + _create_internal_meta_field_name( + "provenance" + ): ann.provenance + }, + ).model_dump(mode="json"), + ) + elif isinstance(ann, MiscAnnotation): + data["meta"].setdefault( + _create_internal_meta_field_name(ann.kind), + ann.content, + ) + else: + # fall back to reusing original annotation type name (in namespaced format) + data["meta"].setdefault( + _create_internal_meta_field_name(ann.kind), + ann.model_dump(mode="json"), + ) + + return data + def export_to_dataframe( self, doc: Optional["DoclingDocument"] = None ) -> pd.DataFrame: diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index e3769f21..6cadc6be 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1302,6 +1302,63 @@ "title": "MiscAnnotation", "type": "object" }, + "MoleculeMetaField": { + "additionalProperties": true, + "description": "Molecule metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "smi": { + "title": "Smi", + "type": "string" + }, + "class_name": { + "title": "Class Name", + "type": "string" + }, + "segmentation": { + "items": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "number" + }, + { + "type": "number" + } + ], + "type": "array" + }, + "title": "Segmentation", + "type": "array" + } + }, + "required": [ + "smi", + "class_name", + "segmentation" + ], + "title": "MoleculeMetaField", + "type": "object" + }, "PageItem": { "description": "PageItem.", "properties": { @@ -1439,6 +1496,8 @@ "confidence": { "anyOf": [ { + "maximum": 1, + "minimum": 0, "type": "number" }, { @@ -1446,19 +1505,12 @@ } ], "default": null, - "title": "Confidence" - }, - "provenance": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 ], - "default": null, - "title": 
"Provenance" + "title": "Confidence" }, "class_name": { "title": "Class Name", @@ -1668,7 +1720,7 @@ }, "PictureMeta": { "additionalProperties": true, - "description": "Picture metadata model.", + "description": "Metadata model for pictures.", "properties": { "summary": { "anyOf": [ @@ -1695,7 +1747,7 @@ "molecule": { "anyOf": [ { - "$ref": "#/$defs/PictureMoleculeData" + "$ref": "#/$defs/MoleculeMetaField" }, { "type": "null" @@ -1706,62 +1758,7 @@ "tabular_chart": { "anyOf": [ { - "$ref": "#/$defs/PictureTabularChartData" - }, - { - "type": "null" - } - ], - "default": null - }, - "line_chart": { - "anyOf": [ - { - "$ref": "#/$defs/PictureLineChartData" - }, - { - "type": "null" - } - ], - "default": null - }, - "bar_chart": { - "anyOf": [ - { - "$ref": "#/$defs/PictureBarChartData" - }, - { - "type": "null" - } - ], - "default": null - }, - "stacked_bar_chart": { - "anyOf": [ - { - "$ref": "#/$defs/PictureStackedBarChartData" - }, - { - "type": "null" - } - ], - "default": null - }, - "pie_chart": { - "anyOf": [ - { - "$ref": "#/$defs/PicturePieChartData" - }, - { - "type": "null" - } - ], - "default": null - }, - "scatter_chart": { - "anyOf": [ - { - "$ref": "#/$defs/PictureScatterChartData" + "$ref": "#/$defs/TabularChartMetaField" }, { "type": "null" @@ -2225,6 +2222,8 @@ "confidence": { "anyOf": [ { + "maximum": 1, + "minimum": 0, "type": "number" }, { @@ -2232,19 +2231,12 @@ } ], "default": null, - "title": "Confidence" - }, - "provenance": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 ], - "default": null, - "title": "Provenance" + "title": "Confidence" }, "text": { "title": "Text", @@ -2493,6 +2485,44 @@ "title": "TableItem", "type": "object" }, + "TabularChartMetaField": { + "additionalProperties": true, + "description": "Tabular chart metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "title": { + "title": "Title", + "type": "string" + }, + "chart_data": { + "$ref": "#/$defs/TableData" + } + }, + "required": [ + "title", + "chart_data" + ], + "title": "TabularChartMetaField", + "type": "object" + }, "TextItem": { "additionalProperties": false, "description": "TextItem.", diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index 915a2ce7..158eb910 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -15,8 +15,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a cartoon image of a duck holding a paper." + "text": "In this image we can see a cartoon image of a duck holding a paper.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -946,8 +946,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see some text and images." 
+ "text": "In this image, we can see some text and images.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -3805,8 +3805,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." + "text": "In this image there is a table with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4128,8 +4128,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a text." + "text": "In this image we can see a text.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4402,8 +4402,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see the text on the image." + "text": "In this image I can see the text on the image.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4638,8 +4638,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a paper with some text on it." + "text": "In this image there is a paper with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5117,8 +5117,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see a table." + "text": "In this image, we can see a table.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5252,8 +5252,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5821,8 +5821,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6348,8 +6348,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6444,8 +6444,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see a blue circle." 
+ "text": "In this image I can see a blue circle.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7261,8 +7261,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "A table with different columns and rows." + "text": "A table with different columns and rows.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7953,8 +7953,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." + "text": "In this image there is a table with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 49ad9d4e..5818ddd7 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -15,8 +15,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a cartoon image of a duck holding a paper." + "text": "In this image we can see a cartoon image of a duck holding a paper.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -946,8 +946,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see some text and images." + "text": "In this image, we can see some text and images.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -3805,8 +3805,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." + "text": "In this image there is a table with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4128,8 +4128,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a text." + "text": "In this image we can see a text.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4402,8 +4402,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see the text on the image." + "text": "In this image I can see the text on the image.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4638,8 +4638,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a paper with some text on it." + "text": "In this image there is a paper with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5117,8 +5117,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see a table." 
+ "text": "In this image, we can see a table.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5252,8 +5252,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5821,8 +5821,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. 
The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6348,8 +6348,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6444,8 +6444,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see a blue circle." + "text": "In this image I can see a blue circle.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7261,8 +7261,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "A table with different columns and rows." + "text": "A table with different columns and rows.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7953,8 +7953,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." + "text": "In this image there is a table with some text on it.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index b3bf77d9..7c8c4d2c 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1903,8 +1903,8 @@ "content_layer": "body", "meta": { "summary": { - "provenance": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see some text and images." 
+ "text": "In this image, we can see some text and images.", + "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -1961,6 +1961,12 @@ } ], "content_layer": "body", + "meta": { + "docling_internal_misc": { + "summary": "Typical Docling setup runtime characterization.", + "type": "performance data" + } + }, "label": "table", "prov": [ { diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index f555edde..e786456e 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -85,19 +85,16 @@ pictures: predictions: - class_name: bar_chart confidence: 0.8 - provenance: model1 + docling_internal_provenance: model1 docling_internal_misc: - content: - myanalysis: - prediction: abc - something_else: - text: aaa - kind: misc - docling_internal_molecule_data: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: class_name: chemistry_molecular_structure - confidence: 0.9876 - kind: molecule_data - provenance: model3-1.0.0 + confidence: 1.0 + docling_internal_provenance: model3-1.0.0 segmentation: - - 0.0 - 0.0 @@ -109,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - provenance: model2 + docling_internal_provenance: model2 text: '...' parent: $ref: '#/body' @@ -170,6 +167,12 @@ tables: width: 231.12 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table + meta: + docling_internal_misc: + foo: bar + summary: + docling_internal_provenance: model3 + text: A description annotation for this table. parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index 030cf785..632425c4 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -85,19 +85,16 @@ pictures: predictions: - class_name: bar_chart confidence: 0.78 - provenance: model1 + docling_internal_provenance: model1 docling_internal_misc: - content: - myanalysis: - prediction: abc - something_else: - text: aaa - kind: misc - docling_internal_molecule_data: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: class_name: chemistry_molecular_structure confidence: 0.98 - kind: molecule_data - provenance: model3-1.0.0 + docling_internal_provenance: model3-1.0.0 segmentation: - - 0.0 - 0.0 @@ -109,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - provenance: model2 + docling_internal_provenance: model2 text: '...' parent: $ref: '#/body' @@ -170,6 +167,12 @@ tables: width: 231.0 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table + meta: + docling_internal_misc: + foo: bar + summary: + docling_internal_provenance: model3 + text: A description annotation for this table. 
parent: $ref: '#/body' prov: From 2002d2d1820caa05836c954e1312e70410904092 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Fri, 24 Oct 2025 16:26:09 +0200 Subject: [PATCH 05/22] expose main prediction Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index b493f503..749f14c5 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1005,6 +1005,18 @@ class PictureClassificationMetaField(_ExtraAllowingModel): default_factory=list, min_length=1 ) + def get_main_prediction(self) -> PictureClassificationPrediction: + """Get prediction with highest confidence (if confidence not available, first is used by convention).""" + max_conf_pos: Optional[int] = None + max_conf: Optional[float] = None + for i, pred in enumerate(self.predictions): + if pred.confidence is not None and ( + max_conf is None or pred.confidence > max_conf + ): + max_conf_pos = i + max_conf = pred.confidence + return self.predictions[max_conf_pos if max_conf_pos is not None else 0] + class MoleculeMetaField(BasePrediction): """Molecule metadata field.""" From 52cae8d22a5d5a0c467aa5ac11ba77c5c3c3774e Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Fri, 24 Oct 2025 17:56:12 +0200 Subject: [PATCH 06/22] ideas on enforcing separation between standard and custom fields Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 83 ++++++++++++++----- test/data/chunker/0_out_chunks.json | 26 +++--- test/data/chunker/0b_out_chunks.json | 26 +++--- .../2408.09869v3_enriched_p2_p3_p5.gt.json | 4 +- test/data/doc/dummy_doc_2_prec.yaml | 12 +-- test/data/doc/dummy_doc_with_meta.yaml | 4 +- .../doc/dummy_doc_with_meta_modified.yaml | 20 +++-- test/test_metadata.py | 48 +++++++++-- 8 files changed, 151 insertions(+), 72 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 749f14c5..a7cf3a08 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -948,16 +948,41 @@ class _ExtraAllowingModel(BaseModel): model_config = ConfigDict(extra="allow") - def _get_extra_dict(self) -> dict[str, Any]: + def get_custom_part(self) -> dict[str, Any]: """Get the extra fields as a dictionary.""" return self.__pydantic_extra__ or {} def _copy_without_extra(self) -> Self: """Create a copy without the extra fields.""" return self.model_validate( - self.model_dump(exclude={ex for ex in self._get_extra_dict()}) + self.model_dump(exclude={ex for ex in self.get_custom_part()}) ) + def _check_custom_field_format(self, key: str) -> None: + parts = key.split(_META_FIELD_NAMESPACE_DELIMITER, maxsplit=1) + if len(parts) != 2 or (not parts[0]) or (not parts[1]): + raise ValueError( + f"Custom meta field name must be in format 'namespace__field_name' (e.g. 
'my_corp__max_size'): {key}" + ) + + @model_validator(mode="after") + def _validate_field_names(self) -> Self: + extra_dict = self.get_custom_part() + for key in self.model_dump(): + if key in extra_dict: + self._check_custom_field_format(key=key) + elif _META_FIELD_NAMESPACE_DELIMITER in key: + raise ValueError( + f"Standard meta field name must not contain '__': {key}" + ) + + return self + + def __setattr__(self, name: str, value: Any) -> None: + super().__setattr__(name, value) + if name in self.get_custom_part(): + self._check_custom_field_format(key=name) + class BasePrediction(_ExtraAllowingModel): """Prediction field.""" @@ -1506,10 +1531,23 @@ class FormulaItem(TextItem): ) -def _create_internal_meta_field_name( - suffix: str, prefix: str = "docling_internal_" +_META_FIELD_NAMESPACE_DELIMITER = "__" + + +def create_meta_field_name( + *, + namespace: str, + name: str, +) -> str: + """Create a meta field name.""" + return f"{namespace}{_META_FIELD_NAMESPACE_DELIMITER}{name}" + + +def _create_migrated_meta_field_name( + *, + name: str, ) -> str: - return f"{prefix}{suffix}" + return create_meta_field_name(namespace="docling_internal", name=name) class PictureItem(FloatingItem): @@ -1527,7 +1565,7 @@ class PictureItem(FloatingItem): @model_validator(mode="before") @classmethod - def migrate_annotations_to_meta(cls, data: Any) -> Any: + def _migrate_annotations_to_meta(cls, data: Any) -> Any: """Migrate the `annotations` field to `meta`.""" if isinstance(data, dict) and (annotations := data.get("annotations")): _logger.warning( @@ -1556,8 +1594,8 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: class_name=pred.class_name, confidence=pred.confidence, **{ - _create_internal_meta_field_name( - "provenance" + _create_migrated_meta_field_name( + name="provenance" ): ann.provenance }, ) @@ -1572,8 +1610,8 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: SummaryMetaField( text=ann.text, **{ - _create_internal_meta_field_name( - "provenance" + _create_migrated_meta_field_name( + name="provenance" ): ann.provenance }, ).model_dump(mode="json"), @@ -1587,8 +1625,8 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: segmentation=ann.segmentation, confidence=ann.confidence, **{ - _create_internal_meta_field_name( - "provenance" + _create_migrated_meta_field_name( + name="provenance" ): ann.provenance }, ).model_dump(mode="json"), @@ -1603,13 +1641,13 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, MiscAnnotation): data["meta"].setdefault( - _create_internal_meta_field_name(ann.kind), + _create_migrated_meta_field_name(name=ann.kind), ann.content, ) else: # fall back to reusing original annotation type name (in namespaced format) data["meta"].setdefault( - _create_internal_meta_field_name(ann.kind), + _create_migrated_meta_field_name(name=ann.kind), ann.model_dump(mode="json"), ) @@ -1795,21 +1833,21 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: SummaryMetaField( text=ann.text, **{ - _create_internal_meta_field_name( - "provenance" + _create_migrated_meta_field_name( + name="provenance" ): ann.provenance }, ).model_dump(mode="json"), ) elif isinstance(ann, MiscAnnotation): data["meta"].setdefault( - _create_internal_meta_field_name(ann.kind), + _create_migrated_meta_field_name(name=ann.kind), ann.content, ) else: # fall back to reusing original annotation type name (in namespaced format) data["meta"].setdefault( - _create_internal_meta_field_name(ann.kind), + _create_migrated_meta_field_name(name=ann.kind), 
ann.model_dump(mode="json"), ) @@ -5789,16 +5827,17 @@ def check_version_is_compatible(cls, v: str) -> str: return CURRENT_VERSION @model_validator(mode="after") # type: ignore - @classmethod - def validate_document(cls, d: "DoclingDocument"): + def validate_document(self) -> Self: """validate_document.""" with warnings.catch_warnings(): # ignore warning from deprecated furniture warnings.filterwarnings("ignore", category=DeprecationWarning) - if not d.validate_tree(d.body) or not d.validate_tree(d.furniture): + if not self.validate_tree(self.body) or not self.validate_tree( + self.furniture + ): raise ValueError("Document hierachy is inconsistent.") - return d + return self @model_validator(mode="after") def validate_misplaced_list_items(self): diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index 158eb910..af2bcdc1 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -16,7 +16,7 @@ "meta": { "summary": { "text": "In this image we can see a cartoon image of a duck holding a paper.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -947,7 +947,7 @@ "meta": { "summary": { "text": "In this image, we can see some text and images.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -3806,7 +3806,7 @@ "meta": { "summary": { "text": "In this image there is a table with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4129,7 +4129,7 @@ "meta": { "summary": { "text": "In this image we can see a text.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4403,7 +4403,7 @@ "meta": { "summary": { "text": "In this image I can see the text on the image.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4639,7 +4639,7 @@ "meta": { "summary": { "text": "In this image there is a paper with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5118,7 +5118,7 @@ "meta": { "summary": { "text": "In this image, we can see a table.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5253,7 +5253,7 @@ "meta": { "summary": { "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5822,7 +5822,7 @@ "meta": { "summary": { "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6349,7 +6349,7 @@ "meta": { "summary": { "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. 
The second row in the table has the label \"Count,\" and the count is 204.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6445,7 +6445,7 @@ "meta": { "summary": { "text": "In this image I can see a blue circle.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7262,7 +7262,7 @@ "meta": { "summary": { "text": "A table with different columns and rows.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7954,7 +7954,7 @@ "meta": { "summary": { "text": "In this image there is a table with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 5818ddd7..3bb8d100 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -16,7 +16,7 @@ "meta": { "summary": { "text": "In this image we can see a cartoon image of a duck holding a paper.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -947,7 +947,7 @@ "meta": { "summary": { "text": "In this image, we can see some text and images.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -3806,7 +3806,7 @@ "meta": { "summary": { "text": "In this image there is a table with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4129,7 +4129,7 @@ "meta": { "summary": { "text": "In this image we can see a text.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4403,7 +4403,7 @@ "meta": { "summary": { "text": "In this image I can see the text on the image.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -4639,7 +4639,7 @@ "meta": { "summary": { "text": "In this image there is a paper with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5118,7 +5118,7 @@ "meta": { "summary": { "text": "In this image, we can see a table.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5253,7 +5253,7 @@ "meta": { "summary": { "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. 
The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -5822,7 +5822,7 @@ "meta": { "summary": { "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6349,7 +6349,7 @@ "meta": { "summary": { "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. 
The second row in the table has the label \"Count,\" and the count is 204.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -6445,7 +6445,7 @@ "meta": { "summary": { "text": "In this image I can see a blue circle.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7262,7 +7262,7 @@ "meta": { "summary": { "text": "A table with different columns and rows.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -7954,7 +7954,7 @@ "meta": { "summary": { "text": "In this image there is a table with some text on it.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7c8c4d2c..56d18254 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1904,7 +1904,7 @@ "meta": { "summary": { "text": "In this image, we can see some text and images.", - "docling_internal_provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" } }, "label": "picture", @@ -1962,7 +1962,7 @@ ], "content_layer": "body", "meta": { - "docling_internal_misc": { + "docling_internal__misc": { "summary": "Typical Docling setup runtime characterization.", "type": "performance data" } diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index e786456e..643ddec0 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -85,8 +85,8 @@ pictures: predictions: - class_name: bar_chart confidence: 0.8 - docling_internal_provenance: model1 - docling_internal_misc: + docling_internal__provenance: model1 + docling_internal__misc: myanalysis: prediction: abc something_else: @@ -94,7 +94,7 @@ pictures: molecule: class_name: chemistry_molecular_structure confidence: 1.0 - docling_internal_provenance: model3-1.0.0 + docling_internal__provenance: model3-1.0.0 segmentation: - - 0.0 - 0.0 @@ -106,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - docling_internal_provenance: model2 + docling_internal__provenance: model2 text: '...' parent: $ref: '#/body' @@ -168,10 +168,10 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_internal_misc: + docling_internal__misc: foo: bar summary: - docling_internal_provenance: model3 + docling_internal__provenance: model3 text: A description annotation for this table. parent: $ref: '#/body' diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml index 9a590549..27ff9ae8 100644 --- a/test/data/doc/dummy_doc_with_meta.yaml +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -180,9 +180,9 @@ texts: meta: summary: confidence: 0.95 - provenance: model1 + docling_internal__provenance: model1 text: This is a title. - my_corp_custom_field_1: More stuff here. + my_corp__foo: More stuff here. 
orig: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index 632425c4..ada5e142 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -85,8 +85,8 @@ pictures: predictions: - class_name: bar_chart confidence: 0.78 - docling_internal_provenance: model1 - docling_internal_misc: + docling_internal__provenance: model1 + docling_internal__misc: myanalysis: prediction: abc something_else: @@ -94,7 +94,7 @@ pictures: molecule: class_name: chemistry_molecular_structure confidence: 0.98 - docling_internal_provenance: model3-1.0.0 + docling_internal__provenance: model3-1.0.0 segmentation: - - 0.0 - 0.0 @@ -106,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - docling_internal_provenance: model2 + docling_internal__provenance: model2 text: '...' parent: $ref: '#/body' @@ -168,10 +168,10 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_internal_misc: + docling_internal__misc: foo: bar summary: - docling_internal_provenance: model3 + docling_internal__provenance: model3 text: A description annotation for this table. parent: $ref: '#/body' @@ -212,10 +212,10 @@ texts: content_layer: body label: title meta: - my_corp_custom_field_1: More stuff here. + my_corp__foo: More stuff here. summary: confidence: 0.95 - provenance: model1 + docling_internal__provenance: model1 text: This is a title. orig: 'DocLayNet: A Large Human-Annotated Dataset for @@ -240,7 +240,9 @@ texts: label: section_header level: 1 meta: - example_custom_field_added_programmaticaly: true + my_corp__coords: + latitude: 8.5417 + longitude: 47.3769 summary: text: This is a section header. orig: OPERATION (cont.) 
diff --git a/test/test_metadata.py b/test/test_metadata.py index b9e5c594..48bb4a4e 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -1,19 +1,57 @@ from pathlib import Path -from docling_core.types.doc.document import DoclingDocument, RefItem +import pytest +from pydantic import BaseModel + +from docling_core.types.doc.document import ( + DoclingDocument, + NodeItem, + RefItem, + create_meta_field_name, +) from .test_data_gen_flag import GEN_TEST_DATA -def test_metadata(): +def test_metadata_usage(): + class CustomCoordinates(BaseModel): + longitude: float + latitude: float + src = Path("test/data/doc/dummy_doc_with_meta.yaml") doc = DoclingDocument.load_from_yaml(filename=src) - example_item = RefItem(cref="#/texts/2").resolve(doc=doc) - example_item.meta.example_custom_field_added_programmaticaly = True + example_item: NodeItem = RefItem(cref="#/texts/2").resolve(doc=doc) + assert example_item.meta is not None + + # add a custom metadata object to the item + target_name = create_meta_field_name(namespace="my_corp", name="coords") + value = CustomCoordinates(longitude=47.3769, latitude=8.5417) + setattr(example_item.meta, target_name, value) + # save the document exp_file = src.parent / f"{src.stem}_modified.yaml" if GEN_TEST_DATA: doc.save_as_yaml(filename=exp_file) else: expected = DoclingDocument.load_from_yaml(filename=exp_file) - assert doc == expected + assert doc.model_dump(mode="json") == expected.model_dump(mode="json") + + # load back the document and read the custom metadata object + loaded_doc = DoclingDocument.load_from_yaml(filename=exp_file) + loaded_item: NodeItem = RefItem(cref="#/texts/2").resolve(doc=loaded_doc) + assert loaded_item.meta is not None + + loaded_dict = loaded_item.meta.get_custom_part()[target_name] + loaded_value = CustomCoordinates.model_validate(loaded_dict) + + # ensure the value is the same + assert loaded_value == value + + +def test_namespace_absence_raises(): + src = Path("test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(filename=src) + example_item = RefItem(cref="#/texts/2").resolve(doc=doc) + + with pytest.raises(ValueError): + example_item.meta.my_corp_programmaticaly_added_field = True From 6ce1dba6ceb0ce53ac90a684ee8576afecd06e78 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Mon, 27 Oct 2025 13:53:29 +0100 Subject: [PATCH 07/22] add custom field setter method Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 52 +++++------ docs/DoclingDocument.json | 90 ++++++++++++++----- test/data/chunker/0_out_chunks.json | 52 +++++------ test/data/chunker/0b_out_chunks.json | 52 +++++------ .../2408.09869v3_enriched_p2_p3_p5.gt.json | 6 +- test/data/doc/dummy_doc_2_prec.yaml | 16 ++-- test/data/doc/dummy_doc_with_meta.yaml | 2 +- .../doc/dummy_doc_with_meta_modified.yaml | 18 ++-- test/test_metadata.py | 13 ++- 9 files changed, 166 insertions(+), 135 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index a7cf3a08..ad53a9d8 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -983,6 +983,12 @@ def __setattr__(self, name: str, value: Any) -> None: if name in self.get_custom_part(): self._check_custom_field_format(key=name) + def set_custom_field(self, namespace: str, name: str, value: Any) -> str: + """Set a custom field and return the key.""" + key = create_meta_field_name(namespace=namespace, name=name) + setattr(self, key, value) + return key + class BasePrediction(_ExtraAllowingModel): 
"""Prediction field.""" @@ -994,11 +1000,11 @@ class BasePrediction(_ExtraAllowingModel): description="The confidence of the prediction.", examples=[0.9, 0.42], ) - # source: Optional[str] = Field( - # default=None, - # description="The origin of the prediction.", - # examples=["ibm-granite/granite-docling-258M"], - # ) + created_by: Optional[str] = Field( + default=None, + description="The origin of the prediction.", + examples=["ibm-granite/granite-docling-258M"], + ) @field_serializer("confidence") def _serialize(self, value: float, info: FieldSerializationInfo) -> float: @@ -1046,11 +1052,7 @@ def get_main_prediction(self) -> PictureClassificationPrediction: class MoleculeMetaField(BasePrediction): """Molecule metadata field.""" - # TODO: remove / rename / document / further specify fields? - - smi: str - class_name: str - segmentation: List[Tuple[float, float]] + smi: str = Field(description="The SMILES representation of the molecule.") class TabularChartMetaField(BasePrediction): @@ -1547,7 +1549,7 @@ def _create_migrated_meta_field_name( *, name: str, ) -> str: - return create_meta_field_name(namespace="docling_internal", name=name) + return create_meta_field_name(namespace="docling_legacy", name=name) class PictureItem(FloatingItem): @@ -1593,11 +1595,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: PictureClassificationPrediction( class_name=pred.class_name, confidence=pred.confidence, - **{ - _create_migrated_meta_field_name( - name="provenance" - ): ann.provenance - }, + created_by=ann.provenance, ) for pred in ann.predicted_classes ], @@ -1609,11 +1607,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: "summary", SummaryMetaField( text=ann.text, - **{ - _create_migrated_meta_field_name( - name="provenance" - ): ann.provenance - }, + created_by=ann.provenance, ).model_dump(mode="json"), ) elif isinstance(ann, PictureMoleculeData): @@ -1621,13 +1615,15 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: "molecule", MoleculeMetaField( smi=ann.smi, - class_name=ann.class_name, - segmentation=ann.segmentation, confidence=ann.confidence, + created_by=ann.provenance, **{ _create_migrated_meta_field_name( - name="provenance" - ): ann.provenance + name="segmentation" + ): ann.segmentation, + _create_migrated_meta_field_name( + name="class_name" + ): ann.class_name, }, ).model_dump(mode="json"), ) @@ -1832,11 +1828,7 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: "summary", SummaryMetaField( text=ann.text, - **{ - _create_migrated_meta_field_name( - name="provenance" - ): ann.provenance - }, + created_by=ann.provenance, ).model_dump(mode="json"), ) elif isinstance(ann, MiscAnnotation): diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 6cadc6be..92528c4b 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1325,36 +1325,30 @@ ], "title": "Confidence" }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, "smi": { + "description": "The SMILES representation of the molecule.", "title": "Smi", "type": "string" - }, - "class_name": { - "title": "Class Name", - "type": "string" - }, - "segmentation": { - "items": { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "number" - }, - { - "type": "number" - } - ], - "type": "array" - }, - "title": "Segmentation", - "type": "array" } }, "required": [ - 
"smi", - "class_name", - "segmentation" + "smi" ], "title": "MoleculeMetaField", "type": "object" @@ -1512,6 +1506,22 @@ ], "title": "Confidence" }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, "class_name": { "title": "Class Name", "type": "string" @@ -2238,6 +2248,22 @@ ], "title": "Confidence" }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, "text": { "title": "Text", "type": "string" @@ -2508,6 +2534,22 @@ ], "title": "Confidence" }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, "title": { "title": "Title", "type": "string" diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index af2bcdc1..a5c1b6df 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -15,8 +15,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image we can see a cartoon image of a duck holding a paper.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a cartoon image of a duck holding a paper." } }, "label": "picture", @@ -946,8 +946,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, we can see some text and images.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." } }, "label": "picture", @@ -3805,8 +3805,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a table with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." } }, "label": "picture", @@ -4128,8 +4128,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image we can see a text.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a text." } }, "label": "picture", @@ -4402,8 +4402,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image I can see the text on the image.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see the text on the image." } }, "label": "picture", @@ -4638,8 +4638,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a paper with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a paper with some text on it." 
} }, "label": "picture", @@ -5117,8 +5117,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, we can see a table.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see a table." } }, "label": "picture", @@ -5252,8 +5252,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" } }, "label": "picture", @@ -5821,8 +5821,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. 
**Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." } }, "label": "picture", @@ -6348,8 +6348,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." } }, "label": "picture", @@ -6444,8 +6444,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image I can see a blue circle.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see a blue circle." } }, "label": "picture", @@ -7261,8 +7261,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "A table with different columns and rows.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "A table with different columns and rows." } }, "label": "picture", @@ -7953,8 +7953,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a table with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." 
} }, "label": "picture", diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 3bb8d100..7815ffc6 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -15,8 +15,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image we can see a cartoon image of a duck holding a paper.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a cartoon image of a duck holding a paper." } }, "label": "picture", @@ -946,8 +946,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, we can see some text and images.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." } }, "label": "picture", @@ -3805,8 +3805,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a table with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." } }, "label": "picture", @@ -4128,8 +4128,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image we can see a text.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image we can see a text." } }, "label": "picture", @@ -4402,8 +4402,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image I can see the text on the image.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see the text on the image." } }, "label": "picture", @@ -4638,8 +4638,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a paper with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a paper with some text on it." } }, "label": "picture", @@ -5117,8 +5117,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, we can see a table.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see a table." } }, "label": "picture", @@ -5252,8 +5252,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. 
The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" } }, "label": "picture", @@ -5821,8 +5821,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." } }, "label": "picture", @@ -6348,8 +6348,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, there is a table with two columns. 
The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." } }, "label": "picture", @@ -6444,8 +6444,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image I can see a blue circle.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image I can see a blue circle." } }, "label": "picture", @@ -7261,8 +7261,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "A table with different columns and rows.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "A table with different columns and rows." } }, "label": "picture", @@ -7953,8 +7953,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image there is a table with some text on it.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image there is a table with some text on it." } }, "label": "picture", diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 56d18254..3663e9a0 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1903,8 +1903,8 @@ "content_layer": "body", "meta": { "summary": { - "text": "In this image, we can see some text and images.", - "docling_internal__provenance": "HuggingFaceTB/SmolVLM-256M-Instruct" + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." } }, "label": "picture", @@ -1962,7 +1962,7 @@ ], "content_layer": "body", "meta": { - "docling_internal__misc": { + "docling_legacy__misc": { "summary": "Typical Docling setup runtime characterization.", "type": "performance data" } diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 643ddec0..75b2b41c 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -85,17 +85,17 @@ pictures: predictions: - class_name: bar_chart confidence: 0.8 - docling_internal__provenance: model1 - docling_internal__misc: + created_by: model1 + docling_legacy__misc: myanalysis: prediction: abc something_else: text: aaa molecule: - class_name: chemistry_molecular_structure confidence: 1.0 - docling_internal__provenance: model3-1.0.0 - segmentation: + created_by: model3-1.0.0 + docling_legacy__class_name: chemistry_molecular_structure + docling_legacy__segmentation: - - 0.0 - 0.0 - - 1.0 @@ -106,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - docling_internal__provenance: model2 + created_by: model2 text: '...' 
parent: $ref: '#/body' @@ -168,10 +168,10 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_internal__misc: + docling_legacy__misc: foo: bar summary: - docling_internal__provenance: model3 + created_by: model3 text: A description annotation for this table. parent: $ref: '#/body' diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml index 27ff9ae8..bb4d0296 100644 --- a/test/data/doc/dummy_doc_with_meta.yaml +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -180,7 +180,7 @@ texts: meta: summary: confidence: 0.95 - docling_internal__provenance: model1 + docling_legacy__provenance: model1 text: This is a title. my_corp__foo: More stuff here. orig: 'DocLayNet: A Large Human-Annotated Dataset for diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index ada5e142..ffc58fa4 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -85,17 +85,17 @@ pictures: predictions: - class_name: bar_chart confidence: 0.78 - docling_internal__provenance: model1 - docling_internal__misc: + created_by: model1 + docling_legacy__misc: myanalysis: prediction: abc something_else: text: aaa molecule: - class_name: chemistry_molecular_structure confidence: 0.98 - docling_internal__provenance: model3-1.0.0 - segmentation: + created_by: model3-1.0.0 + docling_legacy__class_name: chemistry_molecular_structure + docling_legacy__segmentation: - - 0.0 - 0.0 - - 1.0 @@ -106,7 +106,7 @@ pictures: - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 summary: - docling_internal__provenance: model2 + created_by: model2 text: '...' parent: $ref: '#/body' @@ -168,10 +168,10 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_internal__misc: + docling_legacy__misc: foo: bar summary: - docling_internal__provenance: model3 + created_by: model3 text: A description annotation for this table. parent: $ref: '#/body' @@ -215,7 +215,7 @@ texts: my_corp__foo: More stuff here. summary: confidence: 0.95 - docling_internal__provenance: model1 + docling_legacy__provenance: model1 text: This is a title. 
orig: 'DocLayNet: A Large Human-Annotated Dataset for diff --git a/test/test_metadata.py b/test/test_metadata.py index 48bb4a4e..0e9ec479 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -3,12 +3,7 @@ import pytest from pydantic import BaseModel -from docling_core.types.doc.document import ( - DoclingDocument, - NodeItem, - RefItem, - create_meta_field_name, -) +from docling_core.types.doc.document import DoclingDocument, NodeItem, RefItem from .test_data_gen_flag import GEN_TEST_DATA @@ -24,9 +19,11 @@ class CustomCoordinates(BaseModel): assert example_item.meta is not None # add a custom metadata object to the item - target_name = create_meta_field_name(namespace="my_corp", name="coords") value = CustomCoordinates(longitude=47.3769, latitude=8.5417) - setattr(example_item.meta, target_name, value) + target_name = example_item.meta.set_custom_field( + namespace="my_corp", name="coords", value=value + ) + assert target_name == "my_corp__coords" # save the document exp_file = src.parent / f"{src.stem}_modified.yaml" From 9f08d3563f53840cbedb2ef6193cfc9413beb61f Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 11:46:21 +0100 Subject: [PATCH 08/22] update Markdown serialization Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/base.py | 31 +++++++ docling_core/transforms/serializer/common.py | 37 +++++++++ .../transforms/serializer/markdown.py | 81 ++++++++++++++++++- docling_core/types/doc/document.py | 24 +++++- test/data/doc/2408.09869v3_enriched.gt.md | 40 +++++++++ ...nriched_p1_include_annotations_false.gt.md | 40 +++++++++ ...3_enriched_p1_mark_annotations_false.gt.md | 40 +++++++++ ...v3_enriched_p1_mark_annotations_true.gt.md | 40 +++++++++ test/data/doc/barchart.gt.md | 2 + test/data/doc/dummy_doc.yaml.md | 8 +- test/data/doc/dummy_doc_legacy_annotations.md | 13 +++ test/data/doc/dummy_doc_mark_meta.md | 17 ++++ test/data/doc/group_with_metadata.md | 5 ++ test/data/doc/group_with_metadata.yaml | 53 ++++++++++++ test/test_metadata.py | 77 ++++++++++++++++-- 15 files changed, 497 insertions(+), 11 deletions(-) create mode 100644 test/data/doc/dummy_doc_legacy_annotations.md create mode 100644 test/data/doc/dummy_doc_mark_meta.md create mode 100644 test/data/doc/group_with_metadata.md create mode 100644 test/data/doc/group_with_metadata.yaml diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index dc4f2eee..67d0a727 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -9,6 +9,7 @@ from typing import Any, Optional, Union from pydantic import AnyUrl, BaseModel +from typing_extensions import deprecated from docling_core.types.doc.document import ( DocItem, @@ -258,6 +259,7 @@ def serialize_captions( """Serialize the item's captions.""" ... + @deprecated("Use serialize_meta() instead.") @abstractmethod def serialize_annotations( self, @@ -267,6 +269,15 @@ def serialize_annotations( """Serialize the item's annotations.""" ... + @abstractmethod + def serialize_meta( + self, + item: NodeItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + ... + @abstractmethod def get_excluded_refs(self, **kwargs: Any) -> set[str]: """Get references to excluded items.""" @@ -287,6 +298,26 @@ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer: ... 
+class BaseMetaSerializer(ABC): + """Base class for meta serializers.""" + + @abstractmethod + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the meta of the passed item.""" + ... + + def _humanize_text(self, text: str, title: bool = False) -> str: + tmp = text.replace("__", "_").replace("_", " ") + return tmp.title() if title else tmp.capitalize() + + +@deprecated("Use BaseMetaSerializer() instead.") class BaseAnnotationSerializer(ABC): """Base class for annotation serializers.""" diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index f5d80af9..7ffc4cae 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -4,6 +4,7 @@ # """Define base classes for serialization.""" +import logging import re import sys from abc import abstractmethod @@ -22,6 +23,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -56,6 +58,9 @@ _DEFAULT_LAYERS = {cl for cl in ContentLayer} +_logger = logging.getLogger(__name__) + + class _PageBreakNode(NodeItem): """Page break node.""" @@ -215,6 +220,7 @@ class DocSerializer(BaseModel, BaseDocSerializer): list_serializer: BaseListSerializer inline_serializer: BaseInlineSerializer + meta_serializer: Optional[BaseMetaSerializer] = None annotation_serializer: BaseAnnotationSerializer params: CommonParams = CommonParams() @@ -435,6 +441,13 @@ def get_parts( ) if part.text: parts.append(part) + + part = self.serialize_meta( + item=node, + **kwargs, + ) + if part.text: + parts.append(part) return parts @override @@ -528,6 +541,30 @@ def serialize_captions( text_res = "" return create_ser_result(text=text_res, span_source=results) + @override + def serialize_meta( + self, + item: NodeItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + if self.meta_serializer: + return self.meta_serializer.serialize( + item=item, + doc=self.doc, + **kwargs, + ) + else: + _logger.warning("No meta serializer found.") + return create_ser_result( + text="", span_source=item if isinstance(item, DocItem) else [] + ) + # return create_ser_result( + # text=item.meta.model_dump_json() if item.meta else "", + # span_source=item, + # ) + + # TODO deprecate @override def serialize_annotations( self, diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index d0908270..28fae0a3 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Any, Optional, Union -from pydantic import AnyUrl, BaseModel, PositiveInt +from pydantic import AnyUrl, BaseModel, Field, PositiveInt from tabulate import tabulate from typing_extensions import override @@ -23,6 +23,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -36,6 +37,7 @@ ) from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + BaseMeta, CodeItem, ContentLayer, DescriptionAnnotation, @@ -52,14 +54,18 @@ KeyValueItem, ListGroup, ListItem, + MoleculeMetaField, NodeItem, PictureClassificationData, + PictureClassificationMetaField, PictureItem, PictureMoleculeData, PictureTabularChartData, RichTableCell, 
SectionHeaderItem, + SummaryMetaField, TableItem, + TabularChartMetaField, TextItem, TitleItem, ) @@ -102,8 +108,18 @@ class MarkdownParams(CommonParams): page_break_placeholder: Optional[str] = None # e.g. "" escape_underscores: bool = True escape_html: bool = True - include_annotations: bool = True - mark_annotations: bool = False + include_meta: bool = Field(default=True, description="Include item meta.") + mark_meta: bool = Field(default=False, description="Mark meta sections.") + include_annotations: bool = Field( + default=True, + description="Include item annotations.", + deprecated="Use include_meta instead.", + ) + mark_annotations: bool = Field( + default=False, + description="Mark annotation sections.", + deprecated="Use mark_meta instead.", + ) orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO ensure_valid_list_item_marker: bool = True @@ -245,9 +261,67 @@ def serialize( return create_ser_result(text=text, span_source=res_parts) +class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer): + """Markdown-specific meta serializer.""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = MarkdownParams(**kwargs) + return create_ser_result( + text="\n\n".join( + [ + tmp + for key in list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + if ( + tmp := self._serialize_meta_field( + item.meta, key, params.mark_meta + ) + ) + is not None + ] + if params.include_meta and item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + ) + + def _serialize_meta_field( + self, meta: BaseMeta, name: str, mark_meta: bool + ) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None: + # NOTE: currently only considering field type, not field name + if isinstance(field_val, SummaryMetaField): + txt = field_val.text + elif isinstance(field_val, PictureClassificationMetaField): + txt = self._humanize_text(field_val.get_main_prediction().class_name) + elif isinstance(field_val, MoleculeMetaField): + txt = field_val.smi + elif isinstance(field_val, TabularChartMetaField): + # suppressing tabular chart serialization + return None + elif tmp := str(field_val or ""): + txt = tmp + else: + return None + return ( + f"[{self._humanize_text(name, title=True)}] {txt}" if mark_meta else txt + ) + else: + return None + + class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer): """Markdown-specific annotation serializer.""" + @override def serialize( self, *, @@ -629,6 +703,7 @@ class MarkdownDocSerializer(DocSerializer): list_serializer: BaseListSerializer = MarkdownListSerializer() inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer() + meta_serializer: BaseMetaSerializer = MarkdownMetaSerializer() annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer() params: MarkdownParams = MarkdownParams() diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index ad53a9d8..b6e8b3bd 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -4685,6 +4685,10 @@ def save_as_markdown( included_content_layers: Optional[set[ContentLayer]] = None, page_break_placeholder: Optional[str] = None, include_annotations: bool = True, + *, + include_meta: bool = True, + mark_meta: bool = False, + use_legacy_annotations: bool = False, ): """Save to markdown.""" if isinstance(filename, str): @@ -4714,6 +4718,9 
@@ def save_as_markdown( included_content_layers=included_content_layers, page_break_placeholder=page_break_placeholder, include_annotations=include_annotations, + use_legacy_annotations=use_legacy_annotations, + include_meta=include_meta, + mark_meta=mark_meta, ) with open(filename, "w", encoding="utf-8") as fw: @@ -4738,6 +4745,10 @@ def export_to_markdown( # noqa: C901 page_break_placeholder: Optional[str] = None, # e.g. "", include_annotations: bool = True, mark_annotations: bool = False, + *, + include_meta: bool = True, + mark_meta: bool = False, + use_legacy_annotations: bool = False, ) -> str: r"""Serialize to Markdown. @@ -4783,6 +4794,15 @@ def export_to_markdown( # noqa: C901 :param mark_annotations: bool: Whether to mark annotations in the export; only relevant if include_annotations is True. (Default value = False). :type mark_annotations: bool = False + :param use_legacy_annotations: bool: Whether to use legacy annotation serialization. + (Default value = False). + :type use_legacy_annotations: bool = False + :param include_meta: bool: Whether to include meta in the export. + (Default value = True). + :type include_meta: bool = True + :param mark_meta: bool: Whether to mark meta in the export; only + relevant if include_meta is True. (Default value = False). + :type mark_meta: bool = False :returns: The exported Markdown representation. :rtype: str """ @@ -4813,7 +4833,9 @@ def export_to_markdown( # noqa: C901 indent=indent, wrap_width=text_width if text_width > 0 else None, page_break_placeholder=page_break_placeholder, - include_annotations=include_annotations, + include_meta=include_meta and not use_legacy_annotations, + mark_meta=mark_meta, + include_annotations=include_annotations and use_legacy_annotations, mark_annotations=mark_annotations, ), ) diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index 7669a2a8..ceeb19d1 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -1,3 +1,5 @@ +In this image we can see a cartoon image of a duck holding a paper. + @@ -8,6 +10,8 @@ In this image, we can see some text and images. +In this image, we can see some text and images. + licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. @@ -56,6 +60,8 @@ Establishing GPU acceleration support for the AI models is currently work-in-pro +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . @@ -77,6 +83,40 @@ machine learning through dynamic python bytecode transformation and graph compil +In this image there is a table with some text on it. 
+ +In this image we can see a text. + +In this image I can see the cover of the book. + +In this image there is a paper with some text on it. + +In this image, we can see a table with some text. + +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + +In this image I can see a blue circle. + +A table with different columns and rows. + +In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index dd345623..39e674d8 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -2,6 +2,8 @@ +In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar @@ -18,6 +20,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. +In this image, we can see some text and images. + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. @@ -28,6 +32,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -42,3 +48,37 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. 
Pytorch 2: Faster + +In this image there is a table with some text on it. + +In this image we can see a text. + +In this image I can see the cover of the book. + +In this image there is a paper with some text on it. + +In this image, we can see a table with some text. + +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + +In this image I can see a blue circle. + +A table with different columns and rows. + +In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index 61f88f35..ac188c4a 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -4,6 +4,8 @@ In this image we can see a cartoon image of a duck holding a paper. +In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. 
Staar @@ -20,6 +22,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. +In this image, we can see some text and images. + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. summary: Typical Docling setup runtime characterization. @@ -33,6 +37,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -47,3 +53,37 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster + +In this image there is a table with some text on it. + +In this image we can see a text. + +In this image I can see the cover of the book. + +In this image there is a paper with some text on it. + +In this image, we can see a table with some text. 
+ +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + +In this image I can see a blue circle. + +A table with different columns and rows. + +In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md index c08732f2..55d9d60c 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md @@ -4,6 +4,8 @@ +In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -20,6 +22,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. 
Its code architecture allows for easy extensibility and addition of new features and models. +In this image, we can see some text and images. + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. summary: Typical Docling setup runtime characterization. @@ -33,6 +37,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -47,3 +53,37 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster + +In this image there is a table with some text on it. + +In this image we can see a text. + +In this image I can see the cover of the book. + +In this image there is a paper with some text on it. + +In this image, we can see a table with some text. + +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. 
The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + +In this image I can see a blue circle. + +A table with different columns and rows. + +In this image there is a text in the middle. diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index 84f58652..2298833c 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -10,3 +10,5 @@ bar chart | 4 | 0.14 | 0.26 | | 5 | 0.16 | 0.25 | | 6 | 0.24 | 0.24 | + +Bar chart diff --git a/test/data/doc/dummy_doc.yaml.md b/test/data/doc/dummy_doc.yaml.md index bab71376..c018c3f4 100644 --- a/test/data/doc/dummy_doc.yaml.md +++ b/test/data/doc/dummy_doc.yaml.md @@ -2,12 +2,16 @@ Figure 1: Four examples of complex page layouts across different document categories -bar chart + ... +Bar chart + CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 - +{'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} A description annotation for this table. + +{'foo': 'bar'} diff --git a/test/data/doc/dummy_doc_legacy_annotations.md b/test/data/doc/dummy_doc_legacy_annotations.md new file mode 100644 index 00000000..328545e7 --- /dev/null +++ b/test/data/doc/dummy_doc_legacy_annotations.md @@ -0,0 +1,13 @@ +# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Figure 1: Four examples of complex page layouts across different document categories + +bar chart + +... + +CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + + + +A description annotation for this table. \ No newline at end of file diff --git a/test/data/doc/dummy_doc_mark_meta.md b/test/data/doc/dummy_doc_mark_meta.md new file mode 100644 index 00000000..8b9432e6 --- /dev/null +++ b/test/data/doc/dummy_doc_mark_meta.md @@ -0,0 +1,17 @@ +# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Figure 1: Four examples of complex page layouts across different document categories + + + +[Summary] ... 
+ +[Classification] Bar chart + +[Molecule] CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + +[Docling Legacy Misc] {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} + +[Summary] A description annotation for this table. + +[Docling Legacy Misc] {'foo': 'bar'} \ No newline at end of file diff --git a/test/data/doc/group_with_metadata.md b/test/data/doc/group_with_metadata.md new file mode 100644 index 00000000..d33aa19c --- /dev/null +++ b/test/data/doc/group_with_metadata.md @@ -0,0 +1,5 @@ +This part talks about foo and bar. + +Foo + +Bar \ No newline at end of file diff --git a/test/data/doc/group_with_metadata.yaml b/test/data/doc/group_with_metadata.yaml new file mode 100644 index 00000000..c4e71222 --- /dev/null +++ b/test/data/doc/group_with_metadata.yaml @@ -0,0 +1,53 @@ +body: + children: + - $ref: '#/groups/0' + - $ref: '#/texts/0' + - $ref: '#/texts/1' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: [] + content_layer: furniture + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: +- children: [] + content_layer: body + label: unspecified + meta: + summary: + text: This part talks about foo and bar. + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/0' +key_value_items: [] +name: '' +pages: {} +pictures: [] +schema_name: DoclingDocument +tables: [] +texts: +- children: [] + content_layer: body + label: text + orig: Foo + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/0' + text: Foo +- children: [] + content_layer: body + label: text + orig: Bar + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/1' + text: Bar +version: 1.7.0 diff --git a/test/test_metadata.py b/test/test_metadata.py index 0e9ec479..86a2be28 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -3,16 +3,24 @@ import pytest from pydantic import BaseModel -from docling_core.types.doc.document import DoclingDocument, NodeItem, RefItem +from docling_core.types.doc.document import ( + BaseMeta, + DoclingDocument, + NodeItem, + RefItem, + SummaryMetaField, +) +from docling_core.types.doc.labels import DocItemLabel from .test_data_gen_flag import GEN_TEST_DATA -def test_metadata_usage(): - class CustomCoordinates(BaseModel): - longitude: float - latitude: float +class CustomCoordinates(BaseModel): + longitude: float + latitude: float + +def test_metadata_usage(): src = Path("test/data/doc/dummy_doc_with_meta.yaml") doc = DoclingDocument.load_from_yaml(filename=src) example_item: NodeItem = RefItem(cref="#/texts/2").resolve(doc=doc) @@ -52,3 +60,62 @@ def test_namespace_absence_raises(): with pytest.raises(ValueError): example_item.meta.my_corp_programmaticaly_added_field = True + + +def _create_doc_with_group_with_metadata() -> DoclingDocument: + doc = DoclingDocument(name="") + grp = doc.add_group() + grp.meta = BaseMeta( + summary=SummaryMetaField(text="This part talks about foo and bar.") + ) + doc.add_text(text="Foo", label=DocItemLabel.TEXT) + doc.add_text(text="Bar", label=DocItemLabel.TEXT) + return doc + + +def test_group_with_metadata(): + doc = _create_doc_with_group_with_metadata() + + # test dumping to and loading from YAML + exp_file = Path("test/data/doc/group_with_metadata.yaml") + if GEN_TEST_DATA: + doc.save_as_yaml(filename=exp_file) + else: + expected = DoclingDocument.load_from_yaml(filename=exp_file) + assert doc == expected + + # test exporting to Markdown + exp_file = exp_file.with_suffix(".md") + if GEN_TEST_DATA: + 
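+        # with GEN_TEST_DATA set, (re)generate the expected Markdown fixture;
+        # otherwise (in the else branch) export and compare against the stored fixture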
doc.save_as_markdown(filename=exp_file) + else: + actual = doc.export_to_markdown() + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_legacy_annotations(): + inp = Path("test/data/doc/dummy_doc.yaml") + doc = DoclingDocument.load_from_yaml(filename=inp) + exp_file = inp.parent / f"{inp.stem}_legacy_annotations.md" + if GEN_TEST_DATA: + doc.save_as_markdown(filename=exp_file, use_legacy_annotations=True) + else: + actual = doc.export_to_markdown(use_legacy_annotations=True) + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_mark_meta(): + inp = Path("test/data/doc/dummy_doc.yaml") + doc = DoclingDocument.load_from_yaml(filename=inp) + exp_file = inp.parent / f"{inp.stem}_mark_meta.md" + if GEN_TEST_DATA: + doc.save_as_markdown(filename=exp_file, mark_meta=True) + else: + actual = doc.export_to_markdown(mark_meta=True) + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected From c3466e32c037c5c23e0de3f2429bd98de260945d Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 16:53:19 +0100 Subject: [PATCH 09/22] revert description, add include_non_meta, showcase custom serializer for summaries Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 45 +- .../transforms/serializer/markdown.py | 15 +- docling_core/types/doc/document.py | 40 +- docs/DoclingDocument.json | 99 +- test/data/chunker/0_out_chunks.json | 91 +- test/data/chunker/0b_out_chunks.json | 91 +- test/data/doc/2206.01062.yaml.pages.dt | 108 +++ test/data/doc/2408.09869v3_enriched.gt.md | 352 +++++++- ...iched_p1_include_annotations_false.gt.html | 162 ++++ ...nriched_p1_include_annotations_false.gt.md | 340 ++++++- ...riched_p1_include_annotations_true.gt.html | 185 ++++ ...3_enriched_p1_mark_annotations_false.gt.md | 376 +++++++- ...v3_enriched_p1_mark_annotations_true.gt.md | 376 +++++++- .../2408.09869v3_enriched_p2_p3_p5.gt.json | 2 +- test/data/doc/activities.gt.md | 2 + test/data/doc/activities_p1.gt.html | 7 + test/data/doc/activities_p2.gt.html | 23 + test/data/doc/activities_p2.gt.md | 30 + test/data/doc/activities_pb_empty.gt.md | 2 + test/data/doc/activities_pb_non_empty.gt.md | 2 + test/data/doc/activities_pb_none.gt.md | 2 + test/data/doc/barchart.gt.md | 4 +- test/data/doc/checkboxes.gt.md | 6 + test/data/doc/cross_page_lists_chunks.json | 843 +++++++++++++++++- test/data/doc/dummy_doc.yaml.md | 8 +- test/data/doc/dummy_doc_2_prec.yaml | 12 +- test/data/doc/dummy_doc_legacy_annotations.md | 13 - test/data/doc/dummy_doc_mark_meta.md | 12 +- .../doc/dummy_doc_with_meta_modified.yaml | 12 +- test/data/doc/group_with_metadata.md | 5 - test/data/doc/group_with_metadata.yaml | 98 +- test/data/doc/group_with_metadata_default.md | 22 + test/data/doc/group_with_metadata_marked.md | 22 + .../data/doc/group_with_metadata_summaries.md | 6 + test/test_metadata.py | 178 +++- 35 files changed, 3266 insertions(+), 325 deletions(-) delete mode 100644 test/data/doc/dummy_doc_legacy_annotations.md delete mode 100644 test/data/doc/group_with_metadata.md create mode 100644 test/data/doc/group_with_metadata_default.md create mode 100644 test/data/doc/group_with_metadata_marked.md create mode 100644 test/data/doc/group_with_metadata_summaries.md diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 7ffc4cae..b08ee88c 100644 --- a/docling_core/transforms/serializer/common.py 
+++ b/docling_core/transforms/serializer/common.py @@ -85,7 +85,7 @@ def _iterate_items( my_visited: set[str] = visited if visited is not None else set() prev_page_nr: Optional[int] = None page_break_i = 0 - for item, _ in doc.iterate_items( + for item, lvl in doc.iterate_items( root=node, with_groups=True, included_content_layers=layers, @@ -98,7 +98,7 @@ def _iterate_items( ): # if group starts with new page, yield page break before group node my_visited.add(item.self_ref) - for it in _iterate_items( + for it, _ in _iterate_items( doc=doc, layers=layers, node=item, @@ -113,7 +113,7 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl break elif isinstance(item, DocItem) and item.prov: page_no = item.prov[0].page_no @@ -123,10 +123,10 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl page_break_i += 1 prev_page_nr = page_no - yield item + yield item, lvl def _get_annotation_text( @@ -193,10 +193,21 @@ class CommonParams(BaseModel): start_idx: NonNegativeInt = 0 stop_idx: NonNegativeInt = sys.maxsize + include_non_meta: bool = True + include_formatting: bool = True include_hyperlinks: bool = True caption_delim: str = " " + # allowed_meta_names: Optional[set[str]] = Field( + # default=None, + # description="Names of meta fields to include; if None, all fields will be included.", + # ) + # blocked_meta_names: set[str] = Field( + # default_factory=set, + # description="Names of meta fields to block; takes precedence over allowed_meta_names.", + # ) + def merge_with_patch(self, patch: dict[str, Any]) -> Self: """Create an instance by merging the provided patch dict on top of self.""" res = self.model_copy(update=patch) @@ -422,7 +433,7 @@ def get_parts( my_visited: set[str] = visited if visited is not None else set() params = self.params.merge_with_patch(patch=kwargs) - for node in _iterate_items( + for node, lvl in _iterate_items( node=item, doc=self.doc, layers=params.layers, @@ -432,22 +443,26 @@ def get_parts( continue else: my_visited.add(node.self_ref) - part = self.serialize( - item=node, - list_level=list_level, - is_inline_scope=is_inline_scope, - visited=my_visited, - **kwargs, - ) - if part.text: - parts.append(part) part = self.serialize_meta( item=node, + level=lvl, **kwargs, ) if part.text: parts.append(part) + + if params.include_non_meta: + part = self.serialize( + item=node, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=my_visited, + **kwargs, + ) + if part.text: + parts.append(part) + return parts @override diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 28fae0a3..52fe3209 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -41,6 +41,7 @@ CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DocItemLabel, DoclingDocument, @@ -108,7 +109,7 @@ class MarkdownParams(CommonParams): page_break_placeholder: Optional[str] = None # e.g. 
"" escape_underscores: bool = True escape_html: bool = True - include_meta: bool = Field(default=True, description="Include item meta.") + # include_meta: bool = Field(default=True, description="Include item meta.") mark_meta: bool = Field(default=False, description="Mark meta sections.") include_annotations: bool = Field( default=True, @@ -278,16 +279,17 @@ def serialize( text="\n\n".join( [ tmp - for key in list(item.meta.__class__.model_fields) - + list(item.meta.get_custom_part()) + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) if ( tmp := self._serialize_meta_field( item.meta, key, params.mark_meta ) ) - is not None ] - if params.include_meta and item.meta + if item.meta else [] ), span_source=item if isinstance(item, DocItem) else [], @@ -297,9 +299,10 @@ def _serialize_meta_field( self, meta: BaseMeta, name: str, mark_meta: bool ) -> Optional[str]: if (field_val := getattr(meta, name)) is not None: - # NOTE: currently only considering field type, not field name if isinstance(field_val, SummaryMetaField): txt = field_val.text + elif isinstance(field_val, DescriptionMetaField): + txt = field_val.text elif isinstance(field_val, PictureClassificationMetaField): txt = self._humanize_text(field_val.get_main_prediction().class_name) elif isinstance(field_val, MoleculeMetaField): diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index b6e8b3bd..2c4a7c80 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1023,6 +1023,12 @@ class BaseMeta(_ExtraAllowingModel): summary: Optional[SummaryMetaField] = None +class DescriptionMetaField(BasePrediction): + """Description metadata field.""" + + text: str + + class PictureClassificationPrediction(BasePrediction): """Picture classification instance.""" @@ -1062,7 +1068,13 @@ class TabularChartMetaField(BasePrediction): chart_data: TableData -class PictureMeta(BaseMeta): +class FloatingMeta(BaseMeta): + """Metadata model for floating.""" + + description: Optional[DescriptionMetaField] = None + + +class PictureMeta(FloatingMeta): """Metadata model for pictures.""" classification: Optional[PictureClassificationMetaField] = None @@ -1179,8 +1191,6 @@ def _add_sibling( class GroupItem(NodeItem): # Container type, can't be a leaf node """GroupItem.""" - meta: Optional[BaseMeta] = None - name: str = ( "group" # Name of the group, e.g. "Introduction Chapter", # "Slide 5", "Navigation menu list", ... 
@@ -1231,7 +1241,6 @@ class DocItem( label: DocItemLabel prov: List[ProvenanceItem] = [] - meta: Optional[BaseMeta] = None def get_location_tokens( self, @@ -1446,6 +1455,8 @@ class ListItem(TextItem): class FloatingItem(DocItem): """FloatingItem.""" + meta: Optional[FloatingMeta] = None + captions: List[RefItem] = [] references: List[RefItem] = [] footnotes: List[RefItem] = [] @@ -1601,11 +1612,10 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: ], ).model_dump(mode="json"), ) - # migrate description annotation to summary meta field elif isinstance(ann, DescriptionAnnotation): data["meta"].setdefault( - "summary", - SummaryMetaField( + "description", + DescriptionMetaField( text=ann.text, created_by=ann.provenance, ).model_dump(mode="json"), @@ -1822,11 +1832,10 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: # ensure meta field is present data.setdefault("meta", {}) - # migrate description annotation to summary meta field if isinstance(ann, DescriptionAnnotation): data["meta"].setdefault( - "summary", - SummaryMetaField( + "description", + DescriptionMetaField( text=ann.text, created_by=ann.provenance, ).model_dump(mode="json"), @@ -4686,7 +4695,6 @@ def save_as_markdown( page_break_placeholder: Optional[str] = None, include_annotations: bool = True, *, - include_meta: bool = True, mark_meta: bool = False, use_legacy_annotations: bool = False, ): @@ -4719,7 +4727,6 @@ def save_as_markdown( page_break_placeholder=page_break_placeholder, include_annotations=include_annotations, use_legacy_annotations=use_legacy_annotations, - include_meta=include_meta, mark_meta=mark_meta, ) @@ -4746,7 +4753,6 @@ def export_to_markdown( # noqa: C901 include_annotations: bool = True, mark_annotations: bool = False, *, - include_meta: bool = True, mark_meta: bool = False, use_legacy_annotations: bool = False, ) -> str: @@ -4797,11 +4803,8 @@ def export_to_markdown( # noqa: C901 :param use_legacy_annotations: bool: Whether to use legacy annotation serialization. (Default value = False). :type use_legacy_annotations: bool = False - :param include_meta: bool: Whether to include meta in the export. - (Default value = True). - :type include_meta: bool = True :param mark_meta: bool: Whether to mark meta in the export; only - relevant if include_meta is True. (Default value = False). + relevant if use_legacy_annotations is False. (Default value = False). :type mark_meta: bool = False :returns: The exported Markdown representation. 
:rtype: str @@ -4833,7 +4836,8 @@ def export_to_markdown( # noqa: C901 indent=indent, wrap_width=text_width if text_width > 0 else None, page_break_placeholder=page_break_placeholder, - include_meta=include_meta and not use_legacy_annotations, + # allowed_meta_names=set() if use_legacy_annotations else allowed_meta_names, + # blocked_meta_names=blocked_meta_names or set(), mark_meta=mark_meta, include_annotations=include_annotations and use_legacy_annotations, mark_annotations=mark_annotations, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 92528c4b..f1c957cf 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -216,7 +216,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/BaseMeta" + "$ref": "#/$defs/FloatingMeta" }, { "type": "null" @@ -433,6 +433,56 @@ "title": "DescriptionAnnotation", "type": "object" }, + "DescriptionMetaField": { + "additionalProperties": true, + "description": "Description metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "text": { + "title": "Text", + "type": "string" + } + }, + "required": [ + "text" + ], + "title": "DescriptionMetaField", + "type": "object" + }, "DocumentOrigin": { "description": "FileSource.", "properties": { @@ -473,6 +523,36 @@ "title": "DocumentOrigin", "type": "object" }, + "FloatingMeta": { + "additionalProperties": true, + "description": "Metadata model for floating.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/DescriptionMetaField" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "FloatingMeta", + "type": "object" + }, "FormItem": { "additionalProperties": false, "description": "FormItem.", @@ -508,7 +588,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/BaseMeta" + "$ref": "#/$defs/FloatingMeta" }, { "type": "null" @@ -1039,7 +1119,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/BaseMeta" + "$ref": "#/$defs/FloatingMeta" }, { "type": "null" @@ -1743,6 +1823,17 @@ ], "default": null }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/DescriptionMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, "classification": { "anyOf": [ { @@ -2417,7 +2508,7 @@ "meta": { "anyOf": [ { - "$ref": "#/$defs/BaseMeta" + "$ref": "#/$defs/FloatingMeta" }, { "type": "null" diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index a5c1b6df..d3630d3a 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -13,12 +13,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a cartoon image of a duck holding a paper." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -944,12 +939,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see some text and images." 
- } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -3803,12 +3793,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4126,12 +4111,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a text." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4400,12 +4380,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see the text on the image." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4636,12 +4611,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a paper with some text on it." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5115,12 +5085,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see a table." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5250,12 +5215,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5819,12 +5779,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." 
- } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -6346,12 +6301,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -6442,12 +6392,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see a blue circle." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -7259,12 +7204,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "A table with different columns and rows." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -7951,12 +7891,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." - } - }, + "meta": {}, "label": "picture", "prov": [ { diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 7815ffc6..6620ef1e 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -13,12 +13,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a cartoon image of a duck holding a paper." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -944,12 +939,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see some text and images." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -3803,12 +3793,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4126,12 +4111,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image we can see a text." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4400,12 +4380,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see the text on the image." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -4636,12 +4611,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a paper with some text on it." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5115,12 +5085,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, we can see a table." 
- } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5250,12 +5215,7 @@ }, "children": [], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the" - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -5819,12 +5779,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -6346,12 +6301,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -6442,12 +6392,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image I can see a blue circle." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -7259,12 +7204,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "A table with different columns and rows." - } - }, + "meta": {}, "label": "picture", "prov": [ { @@ -7951,12 +7891,7 @@ } ], "content_layer": "body", - "meta": { - "summary": { - "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", - "text": "In this image there is a table with some text on it." 
- } - }, + "meta": {}, "label": "picture", "prov": [ { diff --git a/test/data/doc/2206.01062.yaml.pages.dt b/test/data/doc/2206.01062.yaml.pages.dt index 44f29e05..95b52cc7 100644 --- a/test/data/doc/2206.01062.yaml.pages.dt +++ b/test/data/doc/2206.01062.yaml.pages.dt @@ -43,10 +43,118 @@ DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula List-item , , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4. In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents +DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +KDD '22, August 14-18, 2022, Washington, DC, USA +Figure 2: Distribution of DocLayNet pages across document categories. +to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing 'text in the wild". +The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals Scientific Articles , , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes. +We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features. +To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions. +2 e.g. 
AAPL from https://www.annualreports.com/ +Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5. +In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames. +Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, 'invisible' tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as 'invisible' list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a 'natural' upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4. +4 ANNOTATION CAMPAIGN +The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. 
In phase four, +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar +% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)class labelCountTrainTestValAllFinManSciLawPatTenCaption225242.041.772.3284-8940-6186-9294-9995-9969-78n/aFootnote63180.600.310.5883-91n/a10062-8885-94n/a82-97Formula250272.251.902.9683-85n/an/a84-8786-96n/an/aList-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. +Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right. +we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised. +Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv 3 , government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process. +Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains. +Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula List-item , , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . 
Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on +3 https://arxiv.org/ +DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +KDD '22, August 14-18, 2022, Washington, DC, USA +the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category. +At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages. +Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are: +(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. +(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. +(3) For every Caption , there must be exactly one corresponding Picture or Table . +(4) Connected sub-pictures are grouped together in one Picture object. +(5) Formula numbers are included in a Formula object. +(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. + +The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference. +Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. 
Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations + +05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0 +Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous. +were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar. +Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar +Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. +humanMRCNNFRCNNYOLOR50R101R101v5x6Caption84-8968.471.570.177.7Footnote83-9170.971.873.777.2Formula83-8560.163.463.566.2List-item87-8881.280.881.086.2Page-footer93-9461.659.358.961.1Page-header85-8971.970.072.067.9Picture69-7171.772.772.077.1Section-header83-8467.669.368.474.6Table77-8182.282.982.286.3Text84-8684.685.885.488.1Title60-7276.780.479.982.7All82-8372.473.573.476.8 +to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. 
A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. +5 EXPERIMENTS +The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this +Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. +paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. +In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. +Baselines for Object Detection +In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. +DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +KDD '22, August 14-18, 2022, Washington, DC, USA +Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. 
The reduced label sets were obtained by either down-mapping or dropping labels. +Class-count11654Caption68TextTextTextFootnote71TextTextTextFormula60TextTextTextList-item81Text82TextPage-footer6262--Page-header7268--Picture72727272Section-header68676968Table82838282Text85848484Title77Sec.-h.Sec.-h.Sec.-h.Overall72737877 +Learning Curve +One of the fundamental questions related to any dataset is if it is 'large enough'. To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles. +Impact of Class Labels +The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of +Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in /tildelow 10% point improvement. +Class-count115SplitDocPageDocPageCaption6883Footnote7184Formula6066List-item81888288Page-footer6289Page-header7290Picture72827282Section-header68836983Table82898290Text85918490Title7781All72847887 +lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded. +Impact of Document Split in Train and Test Set +Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. 
every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 0% in mAP over the document-wise splitting. 1 Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided. +Dataset Comparison +Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture , +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar +Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets. +Testing onTraining onlabelsPLNDBDLNPubLayNet (PLN)Figure964323Sec-header87-32Table952449Text96-42total933430DocBank (DB)Figure777131Table196522total486827DocLayNet (DLN)Figure675172Sec-header53-68Table874382Text77-84total594778 +Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text . +For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts. +Example Predictions +To conclude this section, we illustrate the quality of layout predictions one can expect from DocLayNet-trained models by providing a selection of examples without any further post-processing applied. Figure 6 shows selected layout predictions on pages from the test-set of DocLayNet. Results look decent in general across document categories, however one can also observe mistakes such as overlapping clusters of different classes, or entirely missing boxes due to low confidence. +6 CONCLUSION +In this paper, we presented the DocLayNet dataset. 
It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets, DocLayNet was created by human annotation in order to obtain reliable layout ground-truth on a wide variety of publication- and typesettingstyles. Including a large proportion of documents outside the scientific publishing domain adds significant value in this respect. +From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust. +To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap. +REFERENCES +[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. +[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. +[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. +[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. +[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. +[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. +[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. +[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. +[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. +[10] Ross B. Girshick. Fast R-CNN. 
In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. +[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. +[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. +[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu + +DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +KDD '22, August 14-18, 2022, Washington, DC, USA +Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title +Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes. +Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021. +[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. +[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. +[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. +[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. +[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. +[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021. +[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery. +[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021. +[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. 
In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018. +[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019. + diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index ceeb19d1..f916b9c1 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -1,17 +1,74 @@ +# Docling Technical Report + +In this image we can see a cartoon image of a duck holding a paper. + In this image we can see a cartoon image of a duck holding a paper. + + +Version 1.0 + +Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar + +AI4K Group, IBM Research R¨ uschlikon, Switzerland + +## Abstract + +This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. + +## 1 Introduction + +Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. + +With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. + +Here is what Docling delivers today: + +- Converts PDF documents to JSON or Markdown format, stable and lightning fast +- Understands detailed page layout, reading order, locates figures and recovers table structures +- Extracts metadata from the document, such as title, authors, references and language +- Optionally applies OCR, e.g. for scanned PDFs +- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) +- Can leverage different accelerators (GPU, MPS, etc). + +## 2 Getting Started + +To use Docling, you can simply install the docling package from PyPI. 
Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. + +Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. + +``` +from docling.document_converter import DocumentConverter Large +``` + +``` +source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +``` + +Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. + +## 3 Processing pipeline + +Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. + +## 3.1 PDF backends + +Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive + +1 see huggingface.co/ds4sd/docling-models/ + +In this image, we can see some text and images. + Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. In this image, we can see some text and images. -In this image, we can see some text and images. - licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. 
As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. @@ -60,8 +117,33 @@ Establishing GPU acceleration support for the AI models is currently work-in-pro +torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. + {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} +Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. + +| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | +|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| +| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | +| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | +| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | + +## 5 Applications + +Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. + +## 6 Future work and contributions + +Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. + +We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . 
The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. + +## References + +- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. +- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster + machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . @@ -83,18 +165,136 @@ machine learning through dynamic python bytecode transformation and graph compil +## Appendix + +In this section, we illustrate a few examples of Docling's output in Markdown and JSON. + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. 
For all other uses, contact the owner/author(s). + +KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACM Reference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + In this image there is a table with some text on it. +In this image there is a table with some text on it. + + + +In this image we can see a text. + In this image we can see a text. + + +AGL Energy Limited ABN 74 1 + +5 061 375 + +In this image I can see the cover of the book. + In this image I can see the cover of the book. + + +In this image there is a paper with some text on it. + In this image there is a paper with some text on it. + + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACMReference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + +1 INTRODUCTION + +Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). + +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. 
+ +| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | +|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| +| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | + +to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. + +## 5 EXPERIMENTS + +The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this + +In this image, we can see a table with some text. + In this image, we can see a table with some text. + + +Third, achienec + +## EXPERIMENTS + +chalenongayouls ground-vuth dawa such WC + The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. 
The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -105,18 +305,166 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + + + +Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. + +paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. + +In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. + +## Baselines for Object Detection + +In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. 
+ +coioct dcochon modols + +## Baselines for Object Detection + +mak enbrel + +Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. + +KDD '22, August 14-18, 2022, Washington, DC, USA + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % + +between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. + +of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. 
+ + +| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | +|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + +In this image I can see a blue circle. + In this image I can see a blue circle. + + +include publication repositories such as arXiv + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- + +annotated pages, from which we obtain accuracy ranges. + +A table with different columns and rows. + A table with different columns and rows. 
+ + +| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | +|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + +3 + +, + +government offices, + +We reviewed the col- + +, + +Page- + +Title and + +. + +page. Specificity ensures that the choice of label is not ambiguous, + +In this image there is a text in the middle. + In this image there is a text in the middle. + + + +we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific + +only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can + +quality controls. Phase one and two required a small team of experts to a document category, such as + +Abstract in the + +Scientific Articles were assembled and supervised. + +category. We also avoided class labels that are tightly linked to the + +Phase 1: Data selection and preparation. + +Our inclusion cri- + +Author + +Affiliation + +teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). + +semantics of the text. Labels such as and + +, + +as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html index 3e166869..96c6750b 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html @@ -12,6 +12,168 @@

Abstract

1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

+

Here is what Docling delivers today:

+
    +
  • Converts PDF documents to JSON or Markdown format, stable and lightning fast
  • Understands detailed page layout, reading order, locates figures and recovers table structures
  • Extracts metadata from the document, such as title, authors, references and language
  • Optionally applies OCR, e.g. for scanned PDFs
  • Can be configured to be optimal for batch-mode (i.e. high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)
  • Can leverage different accelerators (GPU, MPS, etc.).
+

2 Getting Started

+To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. +

Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Docling code repository.

+
from docling.document_converter import DocumentConverter
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]"
+

Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.
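To make this concrete, the following sketch illustrates what such a configuration could look like; the option names (do_ocr, do_table_structure, max_num_pages) are illustrative assumptions and not a verbatim copy of the Docling API, which is documented in the README.

```
from docling.document_converter import DocumentConverter

# Hypothetical option names, for illustration only; the real option set and the way
# it is passed to the converter are described in the Docling README.
hypothetical_options = dict(
    do_ocr=False,              # e.g. skip OCR for born-digital PDFs
    do_table_structure=True,   # keep table structure recognition enabled
    max_num_pages=100,         # enforce a limit on the input document size
)

converter = DocumentConverter()  # in the real API, the options are supplied to the converter
result = converter.convert_single("https://arxiv.org/pdf/2206.01062")
print(result.render_as_markdown())
```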

+

3 Processing pipeline

+

Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.
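As a mental model of this flow, the following minimal, library-agnostic sketch strings the three stages together; all class and function names here are invented for illustration and do not mirror Docling's internal code.

```
from dataclasses import dataclass, field
from typing import Callable, List

# Library-agnostic sketch of a linear per-document pipeline: parse, run models per
# page, then assemble. Names are illustrative, not Docling internals.

@dataclass
class Page:
    number: int
    text_tokens: list = field(default_factory=list)   # parsed programmatic text cells
    image: object = None                               # rendered bitmap of the page
    predictions: dict = field(default_factory=dict)    # per-model outputs (layout, tables, ...)

def parse_pdf(path: str) -> List[Page]:
    """PDF backend step: extract text tokens and render page images (stubbed here)."""
    return [Page(number=i) for i in range(1, 3)]

def run_models(pages: List[Page], models: List[Callable[[Page], Page]]) -> List[Page]:
    """Apply each AI model to every page, in sequence."""
    for model in models:
        pages = [model(p) for p in pages]
    return pages

def assemble(pages: List[Page]) -> dict:
    """Post-processing and aggregation into a typed document (stubbed here)."""
    return {"pages": [p.number for p in pages], "language": "en"}

def layout_model(page: Page) -> Page:
    page.predictions["layout"] = []  # bounding boxes would be predicted here
    return page

document = assemble(run_models(parse_pdf("sample.pdf"), [layout_model]))
```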

+

3.1 PDF backends

+

Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive

+

1 see huggingface.co/ds4sd/docling-models/

+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
+

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

+

We therefore decided to provide multiple backend choices, and additionally open-source a custom-built PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium, which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.
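The two backend requirements stated above can be summarized in a small abstract interface; the sketch below is a simplified illustration under assumed names (PdfBackendSketch, text_cells, render_page), not Docling's actual backend class.

```
from abc import ABC, abstractmethod
from typing import List, Tuple

# Simplified illustration of the two backend requirements described above;
# class and method names are assumptions, not Docling's actual interface.

class PdfBackendSketch(ABC):
    @abstractmethod
    def text_cells(self, page_no: int) -> List[Tuple[str, Tuple[float, float, float, float]]]:
        """Return (text, bounding-box) pairs for all programmatic text on a page."""

    @abstractmethod
    def render_page(self, page_no: int, dpi: int = 72) -> bytes:
        """Render the page as a bitmap image, as it would appear in a PDF viewer."""
```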

+

3.2 AI models

+

As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.

+

Layout Analysis Model

+

Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].

+

The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.
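Conceptually, this post-processing amounts to confidence-based overlap suppression followed by assigning text tokens to the surviving boxes. A minimal sketch of that idea, with simplified dictionary-based data structures rather than Docling's real types, is shown below.

```
# Minimal sketch of the described post-processing: suppress overlapping proposals
# by confidence, then assign PDF text tokens to the surviving boxes.
# Data structures are simplified; this is not Docling's actual implementation.

def iou(a, b):
    """Intersection-over-union of two (x0, y0, x1, y1) boxes."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    union = area(a) + area(b) - inter
    return inter / union if union else 0.0

def suppress_overlaps(proposals, threshold=0.5):
    """Keep the highest-confidence proposal among heavily overlapping ones."""
    kept = []
    for p in sorted(proposals, key=lambda p: p["confidence"], reverse=True):
        if all(iou(p["bbox"], k["bbox"]) < threshold for k in kept):
            kept.append(p)
    return kept

def attach_tokens(proposals, tokens):
    """Group text tokens into the proposal whose box contains their centre point."""
    for p in proposals:
        p["tokens"] = [
            t for t in tokens
            if p["bbox"][0] <= (t["bbox"][0] + t["bbox"][2]) / 2 <= p["bbox"][2]
            and p["bbox"][1] <= (t["bbox"][1] + t["bbox"][3]) / 2 <= p["bbox"][3]
        ]
    return proposals
```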

+

Table Structure Recognition

+

The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy on both column-heading and row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].

+

The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription of the text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the number of included table cells.
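The matching step can be pictured as assigning each programmatic PDF text cell to the predicted table cell that covers it best; the sketch below illustrates this with simplified data structures and is not Docling's actual implementation.

```
# Simplified sketch of matching predicted table-cell boxes back to PDF text cells,
# so text does not have to be re-transcribed from the table image.

def overlap_fraction(cell_box, pdf_box):
    """Fraction of the PDF text cell area covered by the predicted cell box."""
    x0, y0 = max(cell_box[0], pdf_box[0]), max(cell_box[1], pdf_box[1])
    x1, y1 = min(cell_box[2], pdf_box[2]), min(cell_box[3], pdf_box[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    pdf_area = (pdf_box[2] - pdf_box[0]) * (pdf_box[3] - pdf_box[1])
    return inter / pdf_area if pdf_area else 0.0

def match_cells(predicted_cells, pdf_cells, min_overlap=0.5):
    """Fill each predicted table cell with the text of the PDF cells it covers."""
    for cell in predicted_cells:
        cell["text"] = " ".join(
            pdf["text"]
            for pdf in pdf_cells
            if overlap_fraction(cell["bbox"], pdf["bbox"]) >= min_overlap
        )
    return predicted_cells
```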

+

OCR

+

Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmap images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular third-party OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).

+

We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.

+

3.3 Assembly

+

In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core. The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the user's request.

+

3.4 Extensibility

+

Docling provides a straightforward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract base class (BaseModelPipeline) or cloning the default model pipeline. This effectively allows one to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.

+

Implementations of model classes must satisfy the python Callable interface. The __call__ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.
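A schematic example of a model honouring this contract is shown below; the page objects and the detected_language attribute are placeholders, since real models would extend the PagePredictions data model instead.

```
from types import SimpleNamespace
from typing import Iterable, Iterator

# Schematic model satisfying the Callable contract described above: it consumes an
# iterator of page objects and yields them back augmented with an extra prediction.
# The attribute used here is a placeholder, not part of the actual data model.

class DummyLanguageModel:
    def __call__(self, pages: Iterable) -> Iterator:
        for page in pages:
            page.detected_language = "en"  # a real model would extend PagePredictions
            yield page

pages = [SimpleNamespace(number=1), SimpleNamespace(number=2)]
augmented = list(DummyLanguageModel()(pages))
```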

+

4 Performance

+

In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (by setting the OMP_NUM_THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.
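A rough sketch of such a measurement run is shown below, assuming a placeholder input file; the thread budget is best set before the conversion stack is imported, so that the numerical runtimes pick it up at initialization.

```
import os
import time

# Fix the thread budget before importing the conversion stack, mirroring the setup
# described above; "test_document.pdf" is a placeholder input path.
os.environ["OMP_NUM_THREADS"] = "4"  # or "16" for the full-core-count runs

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
start = time.time()
result = converter.convert_single("test_document.pdf")
print(f"time-to-solution: {time.time() - start:.1f} s")
```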

+

If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.

+

Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and

+

torch runtimes backing the Docling pipeline. We will deliver updates on this topic in a future version of this report.

+
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |
| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem |
| Apple M3 Max (16 cores) | 4 | 177 s | 1.27 | 6.20 GB | 103 s | 2.18 | 2.56 GB |
| | 16 | 167 s | 1.34 | | 92 s | 2.45 | |
| Intel(R) Xeon E5-2690 | 4 | 375 s | 0.60 | 6.16 GB | 239 s | 0.94 | 2.42 GB |
| | 16 | 244 s | 0.92 | | 143 s | 1.57 | |
+

5 Applications

+

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

+

6 Future work and contributions

+

Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equation-recognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.

+

We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.

+

References

+
  1. J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/JaidedAI/EasyOCR, 2024. Version: 1.7.0.
  2. J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24). ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf.

+
    +
  1. C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.
  2. +
  3. J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .
  4. +
  5. O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.
  6. +
  7. IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .
  8. +
  9. A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .
  10. +
  11. J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index .
  12. +
  13. M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8_3 .
  14. +
  15. L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .
  16. +
  17. L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .
  18. +
  19. A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.
  20. +
  21. B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.
  22. +
  23. pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .
  24. +
  25. P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .
  26. +
  27. Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.
  28. +
+

Appendix

+

In this section, we illustrate a few examples of Docling's output in Markdown and JSON.

+

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

+

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

+

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

+

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

+

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

+

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

+

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

+

ABSTRACT

+

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

+

CCS CONCEPTS

+

· Information systems → Document structure; · Applied computing → Document analysis; · Computing methodologies → Machine learning; Computer vision; Object detection;

+

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

+

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

+

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

+

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

+

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

+

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

+

ABSTRACT

+

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

+

CCS CONCEPTS

+

· Information systems → Document structure; · Applied computing → Document analysis; · Computing methodologies → Machine learning; Computer vision; Object detection;

+

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

+

KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

+

Figure 1: Four examples of complex page layouts across different document categories

+

KEYWORDS

+

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

+

ACM Reference Format:

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3534678.3539043

+

AGL Energy Limited ABN 74 1

+

5 061 375

+

Figure 1: Four examples of complex page layouts across different document categories

+

KEYWORDS

+

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

+

ACM Reference Format:

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3534678.3539043

+

1 INTRODUCTION

+

Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.

+

Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).

+

KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

+

Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.

+
| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |
|---|---|---|---|---|
| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |
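
A hedged sketch of the kind of setup the caption describes: starting from a detectron2 model-zoo Mask R-CNN R50-FPN 3x architecture with default configuration and COCO-pretrained weights, retargeted to the 11 DocLayNet classes. The dataset names are assumptions for registered COCO-format splits, not the authors' actual training scripts.

```
from detectron2 import model_zoo
from detectron2.config import get_cfg

# Start from a model-zoo architecture with default settings and
# COCO-pretrained weights, as described in the caption above.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 11          # the 11 DocLayNet classes
cfg.DATASETS.TRAIN = ("doclaynet_train",)     # placeholder dataset registrations
cfg.DATASETS.TEST = ("doclaynet_val",)
print(cfg.MODEL.ROI_HEADS.NUM_CLASSES)
```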
+

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.
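
A minimal sketch of the box-snapping rule described above: shrink a user-drawn box to the smallest box around the text cells it encloses. The function and box representation are illustrative, not the CCS annotation tool's actual code.

```
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1), illustrative only


def snap_to_text_cells(drawn: Box, cells: List[Box]) -> Box:
    """Shrink a user-drawn box to the minimal box around the enclosed text cells."""
    x0, y0, x1, y1 = drawn
    enclosed = [c for c in cells if c[0] >= x0 and c[1] >= y0 and c[2] <= x1 and c[3] <= y1]
    if not enclosed:
        return drawn  # nothing to snap to; keep the original box
    return (
        min(c[0] for c in enclosed),
        min(c[1] for c in enclosed),
        max(c[2] for c in enclosed),
        max(c[3] for c in enclosed),
    )


# Example: a generous hand-drawn box snaps to the two cells it fully encloses.
print(snap_to_text_cells((0, 0, 100, 50), [(10, 5, 40, 12), (10, 20, 80, 30), (150, 5, 160, 10)]))
```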

+

5 EXPERIMENTS

+

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

+

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

+

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

+

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].
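
A minimal sketch of how such mAP@0.5-0.95 scores are obtained with the COCO API (pycocotools); the annotation and detection file names are placeholders, not part of the DocLayNet release.

```
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Ground-truth annotations and model detections in COCO format
# (file names are placeholders).
coco_gt = COCO("doclaynet_test_gt.json")
coco_dt = coco_gt.loadRes("model_detections.json")

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # the first reported AP is averaged over IoU=0.50:0.95 (mAP@0.5-0.95)
```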

+

Baselines for Object Detection

+

In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.

+

Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.

+

KDD '22, August 14-18, 2022, Washington, DC, USA

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

+

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

+
| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
+

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

+

| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
+

we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.

+

Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv³, government offices,

+

page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation, as seen in DocBank, are often only distinguishable by discriminating on

+

3 https://arxiv.org/

+

Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple inter-annotator mAP@0.5-0.95 (%)', are repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).
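
A small sketch of how the span information mentioned in the caption is carried in the JSON representation, using the TableCell and TableData types from docling-core; the concrete cells and offsets below are illustrative, not taken from the DocLayNet table.

```
from docling_core.types.doc.document import TableCell, TableData

# A single header row where the last cell spans two columns, mirroring the
# 'triple inter-annotator mAP' header; offsets and spans are exactly the
# per-cell fields exposed in the JSON view (C).
cells = [
    TableCell(text="class label", start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=0, end_col_offset_idx=1, column_header=True),
    TableCell(text="Count", start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=1, end_col_offset_idx=2, column_header=True),
    TableCell(text="triple inter-annotator mAP @ 0.5-0.95 (%)", col_span=2,
              start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=2, end_col_offset_idx=4, column_header=True),
]
data = TableData(num_rows=1, num_cols=4, table_cells=cells)

for cell in data.table_cells:
    print(cell.text, "col_span:", cell.col_span,
          "cols:", cell.start_col_offset_idx, "-", cell.end_col_offset_idx)
```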

diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index 39e674d8..6adef91e 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report - - In this image we can see a cartoon image of a duck holding a paper. + + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -20,10 +20,95 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. +Here is what Docling delivers today: + +- Converts PDF documents to JSON or Markdown format, stable and lightning fast +- Understands detailed page layout, reading order, locates figures and recovers table structures +- Extracts metadata from the document, such as title, authors, references and language +- Optionally applies OCR, e.g. for scanned PDFs +- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) +- Can leverage different accelerators (GPU, MPS, etc). + +## 2 Getting Started + +To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. + +Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. + +``` +from docling.document_converter import DocumentConverter Large +``` + +``` +source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +``` + +Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. 
+ +## 3 Processing pipeline + +Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. + +## 3.1 PDF backends + +Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive + +1 see huggingface.co/ds4sd/docling-models/ + In this image, we can see some text and images. +Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. + + + +licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. + +We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. + +## 3.2 AI models + +As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. + +## Layout Analysis Model + +Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5]. + +The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. 
All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. + +## Table Structure Recognition + +The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. + +The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. + +## OCR + +Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). + +We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. + +## 3.3 Assembly + +In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. + +## 3.4 Extensibility + +Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. 
We invite everyone in the community to propose additional or alternative models and improvements. + +Implementations of model classes must satisfy the python Callable interface. The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. + +## 4 Performance + +In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. + +If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. + +Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | @@ -32,8 +117,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. 
For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -49,16 +132,141 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster +machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . + +- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. +- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . +- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. +- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . +- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF . +- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . +- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . +- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. 
Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . +- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . +- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. +- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. +- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . +- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . +- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. + +## Appendix + +In this section, we illustrate a few examples of Docling's output in Markdown and JSON. + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. 
+ +## CCS CONCEPTS + +· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). + +KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. 
https://doi.org/10.1145/3534678.3539043 + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACM Reference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + In this image there is a table with some text on it. + + In this image we can see a text. + + +AGL Energy Limited ABN 74 1 + +5 061 375 + In this image I can see the cover of the book. + + In this image there is a paper with some text on it. + + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACMReference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + +1 INTRODUCTION + +Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). + +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. 
+ +| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | +|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| +| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | + +to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. + +## 5 EXPERIMENTS + +The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this + In this image, we can see a table with some text. + + +Third, achienec + +## EXPERIMENTS + +chalenongayouls ground-vuth dawa such WC + The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. 
The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -69,16 +277,140 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + + +Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. + +paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. + +In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. + +## Baselines for Object Detection + +In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. + +coioct dcochon modols + +## Baselines for Object Detection + +mak enbrel + +Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. + +KDD '22, August 14-18, 2022, Washington, DC, USA + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % + +between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. + +of row 'Total') in the train, test and validation sets. 
The inter-annotator agreement is computed as the mAP@0.5-0.95 metric + The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + + In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + + +| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | +|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + In this image I can see a blue circle. + + +include publication repositories such as arXiv + +Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- + +annotated pages, from which we obtain accuracy ranges. + A table with different columns and rows. + + +| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | +|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + +3 + +, + +government offices, + +We reviewed the col- + +, + +Page- + +Title and + +. + +page. Specificity ensures that the choice of label is not ambiguous, + In this image there is a text in the middle. + + + +we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific + +only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can + +quality controls. Phase one and two required a small team of experts to a document category, such as + +Abstract in the + +Scientific Articles were assembled and supervised. + +category. We also avoided class labels that are tightly linked to the + +Phase 1: Data selection and preparation. + +Our inclusion cri- + +Author + +Affiliation + +teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. 
The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C). + +semantics of the text. Labels such as and + +, + +as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html index 0bb79d05..9dc6891e 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html @@ -13,6 +13,191 @@

Abstract

1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

+

Here is what Docling delivers today:

+
    +
  • Converts PDF documents to JSON or Markdown format, stable and lightning fast
  • +
  • Understands detailed page layout, reading order, locates figures and recovers table structures
  • +
  • Extracts metadata from the document, such as title, authors, references and language
  • +
  • Optionally applies OCR, e.g. for scanned PDFs
  • +
• Can be configured to be optimal for batch-mode (i.e. high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)
  • +
  • Can leverage different accelerators (GPU, MPS, etc.).
  • +
+

2 Getting Started

+To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. +

Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Docling code repository.

+
from docling.document_converter import DocumentConverter
+
source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert_single(source)
print(result.render_as_markdown())
# output: "## DocLayNet: A Human-Annotated Dataset for Document-Layout Analysis [...]"
+

Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.

+

3 Processing pipeline

+

Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.
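To make the flow above concrete, here is a minimal, schematic sketch of such a linear, page-wise pipeline. All names (`Page`, `run_pipeline`, the model callables) are illustrative placeholders for exposition, not Docling's internal classes.

```python
from dataclasses import dataclass, field
from typing import Callable, Iterable, List


@dataclass
class Page:
    """Illustrative page object: parsed text tokens plus accumulated model predictions."""
    number: int
    text_tokens: List[dict] = field(default_factory=list)
    predictions: dict = field(default_factory=dict)


def run_pipeline(pages: Iterable[Page], models: List[Callable[[Page], Page]]) -> dict:
    """Apply each model to every page in sequence, then aggregate the results."""
    processed = []
    for page in pages:
        for model in models:  # e.g. layout analysis, then table structure
            page = model(page)
        processed.append(page)
    # the post-processing / assembly stage would build the typed document here
    return {"pages": [p.predictions for p in processed]}
```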

+

3.1 PDF backends

+

Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive

+

1 see huggingface.co/ds4sd/docling-models/

+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
+

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

+

We therefore decided to provide multiple backend choices, and additionally open-source a custom-built PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium, which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.
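As an illustration of the two backend requirements stated above, the sketch below defines a hypothetical backend interface; the method names and return types are assumptions for exposition and do not mirror the actual docling-parse or pypdfium backend classes.

```python
from abc import ABC, abstractmethod
from typing import List, Tuple

BBox = Tuple[float, float, float, float]  # (x0, y0, x1, y1) in page coordinates


class PdfBackend(ABC):
    """Hypothetical interface capturing the two requirements described above."""

    @abstractmethod
    def text_cells(self, page_no: int) -> List[Tuple[str, BBox]]:
        """Return the programmatic text tokens together with their coordinates."""

    @abstractmethod
    def render_page(self, page_no: int, dpi: int = 72) -> bytes:
        """Render the page as a bitmap image (e.g. PNG bytes) for the AI models."""
```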

+

3.2 AI models

+

As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.

+

Layout Analysis Model

+

Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].
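A bare-bones onnxruntime invocation is sketched below; the model path, tensor shape and input name are placeholders (the report does not specify them), and real pre- and post-processing is omitted.

```python
import numpy as np
import onnxruntime as ort

# Hypothetical model path and input shape -- not the actual docling-ibm-models layout.
session = ort.InferenceSession("layout_model.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name

page_image = np.zeros((1, 3, 640, 640), dtype=np.float32)  # stand-in for a preprocessed page
outputs = session.run(None, {input_name: page_image})       # raw detections: boxes, classes, scores
```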

+

The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.
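The overlap removal step can be pictured as a greedy suppression over the predicted boxes: higher-confidence proposals are kept and proposals overlapping them too strongly are discarded. The sketch below is a generic illustration of that idea, not Docling's exact post-processing code.

```python
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def iou(a: Box, b: Box) -> float:
    """Intersection-over-union of two axis-aligned boxes."""
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter) if inter else 0.0


def suppress_overlaps(proposals: List[dict], iou_threshold: float = 0.5) -> List[dict]:
    """Keep higher-confidence proposals, drop ones overlapping an already kept box."""
    kept: List[dict] = []
    for prop in sorted(proposals, key=lambda p: p["confidence"], reverse=True):
        if all(iou(prop["bbox"], k["bbox"]) < iou_threshold for k in kept):
            kept.append(prop)
    return kept
```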

+

Table Structure Recognition

+

The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].

+

The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells.

+

OCR

+

Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmap images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular third-party OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).
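For orientation, a minimal stand-alone EasyOCR call looks like the following; the image path is a placeholder, and rendering the page at 216 dpi is assumed to happen beforehand.

```python
import easyocr

# Initialise once; language models are downloaded on first use.
reader = easyocr.Reader(["en"], gpu=False)

# "page_image.png" stands in for a page bitmap rendered at 216 dpi.
results = reader.readtext("page_image.png")
for bbox, text, confidence in results:
    print(text, confidence)
```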

+

We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.

+

3.3 Assembly

+

In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request.

+

3.4 Extensibility

+

Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.

+

Implementations of model classes must satisfy the python Callable interface. The __call__ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.
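A minimal sketch of such a model class is shown below; the page objects and the attribute being set are illustrative stand-ins for Docling's internal page type and its PagePredictions data model.

```python
from typing import Iterable, Iterator


class DummyLanguageModel:
    """Toy enrichment model satisfying the Callable contract described above."""

    def __call__(self, pages: Iterable) -> Iterator:
        for page in pages:
            # A real model would extend the page's predictions (PagePredictions);
            # here we simply attach a made-up attribute for illustration.
            page.detected_language = "en"
            yield page
```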

+

4 Performance

+

In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting the OMP_NUM_THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.
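A measurement of this kind can be reproduced along the lines sketched below, using the conversion call shown in Section 2; the page count is hard-coded here as an assumption, and OMP_NUM_THREADS must be set before the numerical libraries are imported.

```python
import os
import time

os.environ["OMP_NUM_THREADS"] = "4"  # fix the thread budget before heavy imports

from docling.document_converter import DocumentConverter  # noqa: E402

converter = DocumentConverter()
start = time.perf_counter()
result = converter.convert_single("https://arxiv.org/pdf/2206.01062")
elapsed = time.perf_counter() - start

num_pages = 9  # page count of this example document; adjust for your own input
print(f"TTS: {elapsed:.1f} s, throughput: {num_pages / elapsed:.2f} pages/s")
```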

+

If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.

+

Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and

+

torch runtimes backing the Docling pipeline. We will deliver updates on this topic in a future version of this report.

+
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
| CPU | Thread budget | native backend TTS | native backend Pages/s | native backend Mem | pypdfium backend TTS | pypdfium backend Pages/s | pypdfium backend Mem |
|---|---|---|---|---|---|---|---|
| Apple M3 Max (16 cores) | 4 | 177 s | 1.27 | 6.20 GB | 103 s | 2.18 | 2.56 GB |
| Apple M3 Max (16 cores) | 16 | 167 s | 1.34 | 6.20 GB | 92 s | 2.45 | 2.56 GB |
| Intel Xeon E5-2690 | 4 | 375 s | 0.60 | 6.16 GB | 239 s | 0.94 | 2.42 GB |
| Intel Xeon E5-2690 | 16 | 244 s | 0.92 | 6.16 GB | 143 s | 1.57 | 2.42 GB |
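As a quick sanity check, the throughput column follows directly from the time-to-solution over the 225-page test set:

```python
pages = 225
for label, tts_seconds in [
    ("M3 Max, native backend, 4 threads", 177),
    ("Xeon E5-2690, native backend, 4 threads", 375),
    ("M3 Max, pypdfium backend, 4 threads", 103),
    ("Xeon E5-2690, pypdfium backend, 16 threads", 143),
]:
    print(f"{label}: {pages / tts_seconds:.2f} pages/s")
# Reproduces 1.27, 0.60, 2.18 and 1.57 pages/s, matching Table 1 up to rounding.
```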
+

5 Applications

+

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

+

6 Future work and contributions

+

Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.

+

We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.

+

References

+
    +
  1. J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.
  2. +
  3. J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster
  4. +
+

machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .

+
    +
  1. C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.
  2. +
  3. J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .
  4. +
  5. O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.
  6. +
  7. IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .
  8. +
  9. A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .
  10. +
  11. J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index .
  12. +
  13. M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8_3 .
  14. +
  15. L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .
  16. +
  17. L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .
  18. +
  19. A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.
  20. +
  21. B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.
  22. +
  23. pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .
  24. +
  25. P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .
  26. +
  27. Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.
  28. +
+

Appendix

+

In this section, we illustrate a few examples of Docling's output in Markdown and JSON.

+

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

+

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

+

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

+

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

+

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

+

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

+

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

+

ABSTRACT

+

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

+

CCS CONCEPTS

+

· Information systems → Document structure; · Applied computing → Document analysis; · Computing methodologies → Machine learning; Computer vision; Object detection.

+

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

+

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

+

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

+

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

+

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

+

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

+

ABSTRACT

+

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

+

CCS CONCEPTS

+

· Information systems → Document structure; · Applied computing → Document analysis; · Computing methodologies → Machine learning; Computer vision; Object detection.

+

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

+

KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

+

Figure 1: Four examples of complex page layouts across different document categories

+

KEYWORDS

+

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

+

ACM Reference Format:

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

+
In this image there is a table with some text on it.
+
In this image we can see a text.
+

AGL Energy Limited ABN 74 1

+

5 061 375

+
In this image I can see the cover of the book.
+
In this image there is a paper with some text on it.
+

Figure 1: Four examples of complex page layouts across different document categories

+

KEYWORDS

+

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

+

ACM Reference Format:

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

+

1 INTRODUCTION

+

Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.

Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).

+

KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

+

Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.

+
humanMRCNN R50 R101FRCNN R101YOLO v5x6
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-8668.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.570.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.477.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8
+

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.
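The box-snapping step amounts to shrinking a drawn box to the minimum bounding box of the text cells it encloses. A generic sketch of that geometry (not the CCS tool's actual code) is given below.

```python
from typing import List, Optional, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def snap_box(drawn: Box, text_cells: List[Box]) -> Optional[Box]:
    """Shrink a user-drawn box to the minimum bounding box of the enclosed text cells."""
    enclosed = [c for c in text_cells
                if c[0] >= drawn[0] and c[1] >= drawn[1] and c[2] <= drawn[2] and c[3] <= drawn[3]]
    if not enclosed:
        return None  # nothing to snap to; keep or reject the annotation
    return (min(c[0] for c in enclosed), min(c[1] for c in enclosed),
            max(c[2] for c in enclosed), max(c[3] for c in enclosed))
```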

+

5 EXPERIMENTS

+

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

+
In this image, we can see a table with some text.
+

Third, achienec

+

EXPERIMENTS

+

chalenongayouls ground-vuth dawa such WC

+
The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%.
+

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

+

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

+

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].
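The mAP@0.5-0.95 computation with the COCO API boils down to a few calls, sketched below; the ground-truth and prediction file names are placeholders.

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("doclaynet_test_gt.json")              # ground-truth annotations (placeholder path)
coco_dt = coco_gt.loadRes("model_predictions.json")   # detections in COCO results format

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()
map_50_95 = evaluator.stats[0]  # AP averaged over IoU thresholds 0.50:0.05:0.95
```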

+

Baselines for Object Detection

+

In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.

+

object detection models

+

Baselines for Object Detection

+

mak enbrel

+

Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.

+

KDD '22, August 14-18, 2022, Washington, DC, USA

+

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

+

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

+
The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A"
+
In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318.
+
| class label | Count | % of Total | | | triple inter-annotator mAP @ 0.5-0.95 (%) | | | | | | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| | | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
+
In this image I can see a blue circle.
+

include publication repositories such as arXiv

+

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-

+

annotated pages, from which we obtain accuracy ranges.

+
A table with different columns and rows.
+
% of Total% of Total% of Totaltriple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)
class labelCountTrainTestValAllFinManSciLawPatTen
Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
Page- footer708786.515.586.0093-9488-9095-9610092-9710096-98
Page- header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
Section- header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
+

3

+

,

+

government offices,

+

We reviewed the col-

+

,

+

Page-

+

Title and

+

.

+

page. Specificity ensures that the choice of label is not ambiguous,

+
In this image there is a text in the middle.
+

we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific

+

only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can

+

quality controls. Phase one and two required a small team of experts to a document category, such as

+

Abstract in the

+

Scientific Articles were assembled and supervised.

+

category. We also avoided class labels that are tightly linked to the

+

Phase 1: Data selection and preparation.

+

Our inclusion cri-

+

Author

+

Affiliation

+

teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on

3 https://arxiv.org/

Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).

+

semantics of the text. Labels such as and

+

,

+

as seen

diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index ac188c4a..1b8cda26 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -2,10 +2,10 @@ In this image we can see a cartoon image of a duck holding a paper. - - In this image we can see a cartoon image of a duck holding a paper. + + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,10 +22,97 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. +Here is what Docling delivers today: + +- Converts PDF documents to JSON or Markdown format, stable and lightning fast +- Understands detailed page layout, reading order, locates figures and recovers table structures +- Extracts metadata from the document, such as title, authors, references and language +- Optionally applies OCR, e.g. for scanned PDFs +- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) +- Can leverage different accelerators (GPU, MPS, etc). + +## 2 Getting Started + +To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. + +Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. + +``` +from docling.document_converter import DocumentConverter Large +``` + +``` +source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +``` + +Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. 
+ +## 3 Processing pipeline + +Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. + +## 3.1 PDF backends + +Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive + +1 see huggingface.co/ds4sd/docling-models/ + +In this image, we can see some text and images. + +Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. + In this image, we can see some text and images. + + +licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. + +We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. + +## 3.2 AI models + +As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. + +## Layout Analysis Model + +Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5]. + +The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. 
All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. + +## Table Structure Recognition + +The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. + +The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. + +## OCR + +Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). + +We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. + +## 3.3 Assembly + +In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. + +## 3.4 Extensibility + +Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. 
We invite everyone in the community to propose additional or alternative models and improvements. + +Implementations of model classes must satisfy the python Callable interface. The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. + +## 4 Performance + +In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. + +If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. + +Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + summary: Typical Docling setup runtime characterization. type: performance data @@ -37,8 +124,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. 
Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -54,16 +139,151 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster +machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . + +- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. +- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . +- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. +- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . +- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF . +- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . +- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . +- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . +- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . +- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. 
In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. +- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. +- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . +- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . +- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. + +## Appendix + +In this section, we illustrate a few examples of Docling's output in Markdown and JSON. + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). 
KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). + +KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACM Reference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. 
https://doi.org/10.1145/ 3534678.3539043 + In this image there is a table with some text on it. +In this image there is a table with some text on it. + + + +In this image we can see a text. + In this image we can see a text. + + +AGL Energy Limited ABN 74 1 + +5 061 375 + +In this image I can see the cover of the book. + In this image I can see the cover of the book. + + +In this image there is a paper with some text on it. + In this image there is a paper with some text on it. + + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACMReference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + +1 INTRODUCTION + +Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). + +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. 
+ +| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | +|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| +| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | + +to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. + +## 5 EXPERIMENTS + +The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this + In this image, we can see a table with some text. +In this image, we can see a table with some text. + + + +Third, achienec + +## EXPERIMENTS + +chalenongayouls ground-vuth dawa such WC + The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. 
The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -74,16 +294,164 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + + + +Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. + +paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. + +In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. + +## Baselines for Object Detection + +In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. 
+ +coioct dcochon modols + +## Baselines for Object Detection + +mak enbrel + +Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. + +KDD '22, August 14-18, 2022, Washington, DC, USA + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % + +between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. + +of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric + +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + + +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. 
+ + +| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | +|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + In this image I can see a blue circle. +In this image I can see a blue circle. + + + +include publication repositories such as arXiv + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- + +annotated pages, from which we obtain accuracy ranges. + +A table with different columns and rows. + A table with different columns and rows. 
+ + +| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | +|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + +3 + +, + +government offices, + +We reviewed the col- + +, + +Page- + +Title and + +. + +page. Specificity ensures that the choice of label is not ambiguous, + In this image there is a text in the middle. + +In this image there is a text in the middle. + + + +we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific + +only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can + +quality controls. Phase one and two required a small team of experts to a document category, such as + +Abstract in the + +Scientific Articles were assembled and supervised. + +category. We also avoided class labels that are tightly linked to the + +Phase 1: Data selection and preparation. + +Our inclusion cri- + +Author + +Affiliation + +teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). + +semantics of the text. Labels such as and + +, + +as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md index 55d9d60c..10c9ce4d 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md @@ -1,11 +1,11 @@ # Docling Technical Report +In this image we can see a cartoon image of a duck holding a paper. + In this image we can see a cartoon image of a duck holding a paper. -In this image we can see a cartoon image of a duck holding a paper. - Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,10 +22,97 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. +Here is what Docling delivers today: + +- Converts PDF documents to JSON or Markdown format, stable and lightning fast +- Understands detailed page layout, reading order, locates figures and recovers table structures +- Extracts metadata from the document, such as title, authors, references and language +- Optionally applies OCR, e.g. for scanned PDFs +- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) +- Can leverage different accelerators (GPU, MPS, etc). + +## 2 Getting Started + +To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. + +Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. + +``` +from docling.document_converter import DocumentConverter Large +``` + +``` +source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +``` + +Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. 
OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. + +## 3 Processing pipeline + +Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. + +## 3.1 PDF backends + +Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive + +1 see huggingface.co/ds4sd/docling-models/ + In this image, we can see some text and images. +Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. + +In this image, we can see some text and images. + + + +licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. + +We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. + +## 3.2 AI models + +As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. + +## Layout Analysis Model + +Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. 
For inference, our implementation relies on the onnxruntime [5]. + +The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. + +## Table Structure Recognition + +The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. + +The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. + +## OCR + +Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). + +We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. + +## 3.3 Assembly + +In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. + +## 3.4 Extensibility + +Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. 
This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements. + +Implementations of model classes must satisfy the python Callable interface. The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. + +## 4 Performance + +In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. + +If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. + +Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and + torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + summary: Typical Docling setup runtime characterization. type: performance data @@ -37,8 +124,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. 
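To make the thread-budget setup from Section 4 above concrete, the short sketch below fixes OMP_NUM_THREADS before converting a document; the environment variable is standard OpenMP, the converter calls mirror the usage example quoted earlier in this file, and the commented backend line is a hypothetical placeholder because the exact option for selecting the pypdfium backend is not given in the text.

```python
import os

# Fix the OpenMP thread budget before heavy imports (4 is the Docling default
# used for the measurements in Table 1; 16 matches the full core count there).
os.environ["OMP_NUM_THREADS"] = "4"

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert_single("https://arxiv.org/pdf/2206.01062")
print(result.render_as_markdown()[:200])

# Hypothetical: the text states that a pypdfium-based backend can be configured
# for low-resource environments, but the option name is not specified here.
# converter = DocumentConverter(pdf_backend=...)
```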
Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. @@ -54,16 +139,151 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster +machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . + +- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. +- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . +- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. +- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . +- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF . +- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . +- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . +- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . +- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. 
ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . +- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. +- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. +- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . +- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . +- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. + +## Appendix + +In this section, we illustrate a few examples of Docling's output in Markdown and JSON. + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. 
+ +## CCS CONCEPTS + +· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 + +Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com + +Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com + +Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com + +Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com + +Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com + +## ABSTRACT + +Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. + +## CCS CONCEPTS + +Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; + +Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). + +KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. 
https://doi.org/10.1145/3534678.3539043 + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACM Reference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + In this image there is a table with some text on it. +In this image there is a table with some text on it. + + + In this image we can see a text. +In this image we can see a text. + + + +AGL Energy Limited ABN 74 1 + +5 061 375 + In this image I can see the cover of the book. +In this image I can see the cover of the book. + + + In this image there is a paper with some text on it. +In this image there is a paper with some text on it. + + + +Figure 1: Four examples of complex page layouts across different document categories + +## KEYWORDS + +PDF document conversion, layout segmentation, object-detection, data set, Machine Learning + +## ACMReference Format: + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 + +1 INTRODUCTION + +Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). + +KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. 
+ +| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | +|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| +| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | + +to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. + +## 5 EXPERIMENTS + +The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this + In this image, we can see a table with some text. +In this image, we can see a table with some text. + + + +Third, achienec + +## EXPERIMENTS + +chalenongayouls ground-vuth dawa such WC + The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. 
The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -74,16 +294,164 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. +The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + + + +Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. + +paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. + +In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. + +## Baselines for Object Detection + +In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. 
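To make the evaluation protocol described above concrete, the following is a minimal sketch using pycocotools (the COCO API referenced above) to compute mAP@0.5-0.95 for bounding-box predictions. The file names `ground_truth.json` and `predictions.json` are placeholders for COCO-format annotations and detector outputs, not files from this repository.

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Hypothetical COCO-format files: ground-truth boxes and detector predictions.
coco_gt = COCO("ground_truth.json")
coco_dt = coco_gt.loadRes("predictions.json")

# Bounding-box evaluation; COCOeval averages AP over IoU thresholds
# 0.50:0.95 in steps of 0.05, i.e. the mAP@0.5-0.95 metric reported here.
evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # the first summarized value is AP@[0.50:0.95]
```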
+ +coioct dcochon modols + +## Baselines for Object Detection + +mak enbrel + +Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. + +KDD '22, August 14-18, 2022, Washington, DC, USA + +Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % + +between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. + +of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric + The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" +The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + + + In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. +In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. 
+ + + +| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | +|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + In this image I can see a blue circle. +In this image I can see a blue circle. + + + +include publication repositories such as arXiv + +Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- + +annotated pages, from which we obtain accuracy ranges. + A table with different columns and rows. +A table with different columns and rows. 
+ + + +| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | +|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| +| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | +| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | +| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | +| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | +| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | +| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | +| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | +| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | +| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | +| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | +| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | +| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | +| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | + +3 + +, + +government offices, + +We reviewed the col- + +, + +Page- + +Title and + +. + +page. Specificity ensures that the choice of label is not ambiguous, + In this image there is a text in the middle. + +In this image there is a text in the middle. + + + +we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific + +only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can + +quality controls. Phase one and two required a small team of experts to a document category, such as + +Abstract in the + +Scientific Articles were assembled and supervised. + +category. We also avoided class labels that are tightly linked to the + +Phase 1: Data selection and preparation. + +Our inclusion cri- + +Author + +Affiliation + +teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). + +semantics of the text. Labels such as and + +, + +as seen diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 3663e9a0..59d525d3 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1902,7 +1902,7 @@ ], "content_layer": "body", "meta": { - "summary": { + "description": { "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", "text": "In this image, we can see some text and images." } diff --git a/test/data/doc/activities.gt.md b/test/data/doc/activities.gt.md index 0770bf62..d03e5d7d 100644 --- a/test/data/doc/activities.gt.md +++ b/test/data/doc/activities.gt.md @@ -6,6 +6,8 @@ Duck Figure 1: This is a cute duckling + + ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_p1.gt.html b/test/data/doc/activities_p1.gt.html index 18932e44..6b0b9cad 100644 --- a/test/data/doc/activities_p1.gt.html +++ b/test/data/doc/activities_p1.gt.html @@ -144,6 +144,13 @@

Let's swim!

Hmm, what else…

  • -Another activity item
  • +
  • -Yet another one
  • +
  • -Stopping it here
  • +
+

Some text.

+
    +
  • -Starting the next page with a list item.
  • +
  • -Second item.
diff --git a/test/data/doc/activities_p2.gt.html b/test/data/doc/activities_p2.gt.html index 9ab4b1be..6b0b9cad 100644 --- a/test/data/doc/activities_p2.gt.html +++ b/test/data/doc/activities_p2.gt.html @@ -124,11 +124,34 @@
+

Summer activities

+

Swimming in the lake

+

Duck

+
Figure 1: This is a cute duckling
+

Let's swim!

+

To get started with swimming, first lay down in a water and try not to drown:

    +
  • ∞ You can relax and look around
  • +
  • ∞ Paddle about
  • +
  • ∞ Enjoy summer warmth
  • +
+

Also, don't forget:

+
    +
  • 1. Wear sunglasses
  • +
  • 2. Don't forget to drink water
  • +
  • 3. Use sun cream
  • +
+

Hmm, what else…

+
    +
  • -Another activity item
  • -Yet another one
  • -Stopping it here

Some text.

+
    +
  • -Starting the next page with a list item.
  • +
  • -Second item.
  • +
diff --git a/test/data/doc/activities_p2.gt.md b/test/data/doc/activities_p2.gt.md index 4801d37b..b8910c37 100644 --- a/test/data/doc/activities_p2.gt.md +++ b/test/data/doc/activities_p2.gt.md @@ -1,4 +1,34 @@ +## Summer activities + +## Swimming in the lake + +Duck + +Figure 1: This is a cute duckling + + + +## Let's swim! + +To get started with swimming, first lay down in a water and try not to drown: + +- ∞ You can relax and look around +- ∞ Paddle about +- ∞ Enjoy summer warmth + +Also, don't forget: + +- 1. Wear sunglasses +- 2. Don't forget to drink water +- 3. Use sun cream + +Hmm, what else… + +- -Another activity item - -Yet another one - -Stopping it here Some text. + +- -Starting the next page with a list item. +- -Second item. diff --git a/test/data/doc/activities_pb_empty.gt.md b/test/data/doc/activities_pb_empty.gt.md index 185578eb..0a1695cd 100644 --- a/test/data/doc/activities_pb_empty.gt.md +++ b/test/data/doc/activities_pb_empty.gt.md @@ -6,6 +6,8 @@ Duck Figure 1: This is a cute duckling + + ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_pb_non_empty.gt.md b/test/data/doc/activities_pb_non_empty.gt.md index c134cf71..e3ca76eb 100644 --- a/test/data/doc/activities_pb_non_empty.gt.md +++ b/test/data/doc/activities_pb_non_empty.gt.md @@ -6,6 +6,8 @@ Duck Figure 1: This is a cute duckling + + ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_pb_none.gt.md b/test/data/doc/activities_pb_none.gt.md index 1e983a54..b8910c37 100644 --- a/test/data/doc/activities_pb_none.gt.md +++ b/test/data/doc/activities_pb_none.gt.md @@ -6,6 +6,8 @@ Duck Figure 1: This is a cute duckling + + ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index 2298833c..e8b988c7 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -1,3 +1,5 @@ +Bar chart + bar chart @@ -10,5 +12,3 @@ bar chart | 4 | 0.14 | 0.26 | | 5 | 0.16 | 0.25 | | 6 | 0.24 | 0.24 | - -Bar chart diff --git a/test/data/doc/checkboxes.gt.md b/test/data/doc/checkboxes.gt.md index e4303929..f9114275 100644 --- a/test/data/doc/checkboxes.gt.md +++ b/test/data/doc/checkboxes.gt.md @@ -1,3 +1,7 @@ + + + + Security Classification / Classification de sécurité Contract Number / Numéro du contrat @@ -162,6 +166,8 @@ UNCLASSIFIED + + Security Classification / Classification de sécurité Contract Number / Numéro du contrat diff --git a/test/data/doc/cross_page_lists_chunks.json b/test/data/doc/cross_page_lists_chunks.json index e7abf3c3..92ad398d 100644 --- a/test/data/doc/cross_page_lists_chunks.json +++ b/test/data/doc/cross_page_lists_chunks.json @@ -1,7 +1,7 @@ { "root": [ { - "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. 
The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", + "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. 
Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. 
We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", @@ -306,90 +306,790 @@ } ] }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 53.0, + "t": 700.675, + "r": 241.03499999999997, + "b": 676.965, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 61 + ] + } + ] + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 665.675, + "r": 294.85, + "b": 574.47, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 264 + ] + } + ] + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 53.0, + "t": 563.1800000000001, + "r": 252.34000000000003, + "b": 552.97, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 40 + ] + } + ] + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 53.0, + "t": 545.685, + "r": 295.29999999999995, + "b": 508.47, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 131 + ] + } + ] + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 497.185, + "r": 254.35000000000002, + "b": 473.475, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 53 + ] + } + ] + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 466.185, + "r": 173.92499999999995, + "b": 455.975, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ] + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 448.69, + "r": 198.11, + "b": 438.475, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 25 + ] + } + ] + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 431.19, + "r": 136.28499999999997, + "b": 420.975, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 12 + ] + } + ] + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 413.69, + "r": 138.40499999999997, + "b": 403.48, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 13 + ] + } + ] + }, + { + "self_ref": "#/texts/24", + "parent": { + 
"$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 71.005, + "t": 396.19, + "r": 192.79999999999995, + "b": 385.98, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 23 + ] + } + ] + }, { "self_ref": "#/texts/12", "parent": { - "$ref": "#/body" + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 53.0, + "t": 42.09500000000003, + "r": 153.175, + "b": 34.51999999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 153.175, + "t": 42.09000000000003, + "r": 279.82, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 36 + ] + } + ] + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 501.77, + "t": 42.09000000000003, + "r": 559.0, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ] + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "section_header", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 315.0, + "t": 699.765, + "r": 488.70000000000005, + "b": 686.88, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 26 + ] + } + ] + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 315.0, + "t": 680.55, + "r": 556.2550000000001, + "b": 535.345, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 490 + ] + } + ] + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 315.0, + "t": 524.0550000000001, + "r": 561.4000000000001, + "b": 392.35, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 440 + ] + } + ] + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 53.0, + "t": 42.09500000000003, + "r": 152.53499999999997, + "b": 34.51999999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 152.53499999999997, + "t": 42.09000000000003, + "r": 279.17499999999995, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 36 + ] + } + ] + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 2, + "bbox": { + "l": 501.77, + "t": 42.09000000000003, + "r": 559.0, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ] + } + ] + } + }, + { + "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, 
Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. 
Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "section_header", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 53.0, + "t": 688.815, + "r": 319.925, + "b": 673.665, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 31 + ] + } + ] + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 53.0, + "t": 632.15, + "r": 291.99, + "b": 540.94, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 321 + ] + } + ] + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 53.0, + "t": 529.655, + "r": 289.795, + "b": 330.45, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 707 + ] + } + ] + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 315.0, + "t": 632.16, + "r": 535.95, + "b": 608.38, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 67 + ] + } + ] + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", - "label": "page_footer", + "label": "list_item", "prov": [ { "page_no": 1, "bbox": { - "l": 53.0, - "t": 42.09500000000003, - "r": 153.175, - "b": 34.51999999999998, + "l": 315.0, + "t": 597.15, + "r": 551.98, + "b": 573.44, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 28 + 80 ] } ] }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/5", "parent": { - "$ref": "#/body" + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", - "label": "page_footer", + "label": "list_item", "prov": [ { "page_no": 1, "bbox": { - "l": 153.175, - "t": 42.09000000000003, - "r": 279.82, - "b": 34.565000000000055, + "l": 315.0, + "t": 566.155, + "r": 541.83, + "b": 501.945, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 36 + 184 ] } ] }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/6", "parent": { - "$ref": "#/body" + "$ref": "#/groups/0" }, "children": [], "content_layer": "body", - "label": "page_footer", + "label": "list_item", "prov": [ { "page_no": 1, "bbox": { - "l": 501.77, - "t": 42.09000000000003, - "r": 559.0, - "b": 34.565000000000055, + "l": 315.0, + "t": 494.655, + "r": 543.115, + "b": 484.445, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 17 + 49 ] } ] - } - ] - } - }, - { - "text": "- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and 
anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 315.0, + "t": 477.155, + "r": 531.59, + "b": 439.945, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 116 + ] + } + ] + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 333.005, + "t": 428.66, + "r": 545.025, + "b": 404.95, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 84 + ] + } + ] + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 333.005, + "t": 397.66, + "r": 561.63, + "b": 346.95, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 180 + ] + } + ] + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 333.005, + "t": 339.665, + "r": 550.675, + "b": 261.95500000000004, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 231 + ] + } + ] + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 333.005, + "t": 254.66499999999996, + "r": 557.23, + "b": 190.46000000000004, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 202 + ] + } + ] + }, { "self_ref": "#/texts/15", "parent": { @@ -640,6 +1340,81 @@ } ] }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": 
"page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 53.0, + "t": 42.09500000000003, + "r": 153.175, + "b": 34.51999999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 153.175, + "t": 42.09000000000003, + "r": 279.82, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 36 + ] + } + ] + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "page_footer", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 501.77, + "t": 42.09000000000003, + "r": 559.0, + "b": 34.565000000000055, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ] + }, { "self_ref": "#/texts/25", "parent": { diff --git a/test/data/doc/dummy_doc.yaml.md b/test/data/doc/dummy_doc.yaml.md index c018c3f4..bd4e6b23 100644 --- a/test/data/doc/dummy_doc.yaml.md +++ b/test/data/doc/dummy_doc.yaml.md @@ -1,9 +1,5 @@ # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -Figure 1: Four examples of complex page layouts across different document categories - - - ... Bar chart @@ -12,6 +8,10 @@ CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} +Figure 1: Four examples of complex page layouts across different document categories + + + A description annotation for this table. {'foo': 'bar'} diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 75b2b41c..31fa5fdd 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -86,6 +86,9 @@ pictures: - class_name: bar_chart confidence: 0.8 created_by: model1 + description: + created_by: model2 + text: '...' docling_legacy__misc: myanalysis: prediction: abc @@ -105,9 +108,6 @@ pictures: - - 1.0 - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 - summary: - created_by: model2 - text: '...' parent: $ref: '#/body' prov: @@ -168,11 +168,11 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_legacy__misc: - foo: bar - summary: + description: created_by: model3 text: A description annotation for this table. + docling_legacy__misc: + foo: bar parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_legacy_annotations.md b/test/data/doc/dummy_doc_legacy_annotations.md deleted file mode 100644 index 328545e7..00000000 --- a/test/data/doc/dummy_doc_legacy_annotations.md +++ /dev/null @@ -1,13 +0,0 @@ -# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -Figure 1: Four examples of complex page layouts across different document categories - -bar chart - -... - -CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 - - - -A description annotation for this table. \ No newline at end of file diff --git a/test/data/doc/dummy_doc_mark_meta.md b/test/data/doc/dummy_doc_mark_meta.md index 8b9432e6..f9f8cdf4 100644 --- a/test/data/doc/dummy_doc_mark_meta.md +++ b/test/data/doc/dummy_doc_mark_meta.md @@ -1,10 +1,6 @@ # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -Figure 1: Four examples of complex page layouts across different document categories - - - -[Summary] ... +[Description] ... 
[Classification] Bar chart @@ -12,6 +8,10 @@ Figure 1: Four examples of complex page layouts across different document catego [Docling Legacy Misc] {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} -[Summary] A description annotation for this table. +Figure 1: Four examples of complex page layouts across different document categories + + + +[Description] A description annotation for this table. [Docling Legacy Misc] {'foo': 'bar'} \ No newline at end of file diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index ffc58fa4..a58a566b 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -86,6 +86,9 @@ pictures: - class_name: bar_chart confidence: 0.78 created_by: model1 + description: + created_by: model2 + text: '...' docling_legacy__misc: myanalysis: prediction: abc @@ -105,9 +108,6 @@ pictures: - - 1.0 - 1.0 smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 - summary: - created_by: model2 - text: '...' parent: $ref: '#/body' prov: @@ -168,11 +168,11 @@ tables: uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table meta: - docling_legacy__misc: - foo: bar - summary: + description: created_by: model3 text: A description annotation for this table. + docling_legacy__misc: + foo: bar parent: $ref: '#/body' prov: diff --git a/test/data/doc/group_with_metadata.md b/test/data/doc/group_with_metadata.md deleted file mode 100644 index d33aa19c..00000000 --- a/test/data/doc/group_with_metadata.md +++ /dev/null @@ -1,5 +0,0 @@ -This part talks about foo and bar. - -Foo - -Bar \ No newline at end of file diff --git a/test/data/doc/group_with_metadata.yaml b/test/data/doc/group_with_metadata.yaml index c4e71222..b9353204 100644 --- a/test/data/doc/group_with_metadata.yaml +++ b/test/data/doc/group_with_metadata.yaml @@ -1,10 +1,11 @@ body: children: - $ref: '#/groups/0' - - $ref: '#/texts/0' - - $ref: '#/texts/1' content_layer: body label: unspecified + meta: + summary: + text: This document talks about various topics. name: _root_ self_ref: '#/body' form_items: [] @@ -15,16 +16,55 @@ furniture: name: _root_ self_ref: '#/furniture' groups: -- children: [] +- children: + - $ref: '#/texts/0' + - $ref: '#/groups/1' + - $ref: '#/groups/3' content_layer: body - label: unspecified + label: chapter meta: + my_corp__test: value summary: - text: This part talks about foo and bar. - name: group + text: This chapter discusses foo and bar. + name: '1' parent: $ref: '#/body' self_ref: '#/groups/0' +- children: + - $ref: '#/texts/1' + - $ref: '#/groups/2' + content_layer: body + label: section + meta: + summary: + text: This section talks about foo. + name: 1a + parent: + $ref: '#/groups/0' + self_ref: '#/groups/1' +- children: + - $ref: '#/texts/2' + - $ref: '#/texts/3' + content_layer: body + label: list + meta: + summary: + text: Here some foo specifics are listed. + name: group + parent: + $ref: '#/groups/1' + self_ref: '#/groups/2' +- children: + - $ref: '#/texts/4' + content_layer: body + label: section + meta: + summary: + text: This section talks about bar. + name: 1b + parent: + $ref: '#/groups/0' + self_ref: '#/groups/3' key_value_items: [] name: '' pages: {} @@ -35,19 +75,53 @@ texts: - children: [] content_layer: body label: text - orig: Foo + orig: This is some introductory text. 
parent: - $ref: '#/body' + $ref: '#/groups/0' prov: [] self_ref: '#/texts/0' - text: Foo + text: This is some introductory text. - children: [] content_layer: body label: text - orig: Bar + meta: + summary: + text: This paragraph provides more details about foo. + orig: Regarding foo... parent: - $ref: '#/body' + $ref: '#/groups/1' prov: [] self_ref: '#/texts/1' - text: Bar + text: Regarding foo... +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '' + orig: lorem + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/2' + text: lorem +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '' + orig: ipsum + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/3' + text: ipsum +- children: [] + content_layer: body + label: text + orig: Regarding bar... + parent: + $ref: '#/groups/3' + prov: [] + self_ref: '#/texts/4' + text: Regarding bar... version: 1.7.0 diff --git a/test/data/doc/group_with_metadata_default.md b/test/data/doc/group_with_metadata_default.md new file mode 100644 index 00000000..157acfa4 --- /dev/null +++ b/test/data/doc/group_with_metadata_default.md @@ -0,0 +1,22 @@ +This document talks about various topics. + +This chapter discusses foo and bar. + +value + +This is some introductory text. + +This section talks about foo. + +This paragraph provides more details about foo. + +Regarding foo... + +Here some foo specifics are listed. + +1. lorem +2. ipsum + +This section talks about bar. + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_marked.md b/test/data/doc/group_with_metadata_marked.md new file mode 100644 index 00000000..5393569e --- /dev/null +++ b/test/data/doc/group_with_metadata_marked.md @@ -0,0 +1,22 @@ +[Summary] This document talks about various topics. + +[Summary] This chapter discusses foo and bar. + +[My Corp Test] value + +This is some introductory text. + +[Summary] This section talks about foo. + +[Summary] This paragraph provides more details about foo. + +Regarding foo... + +[Summary] Here some foo specifics are listed. + +1. lorem +2. ipsum + +[Summary] This section talks about bar. + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_summaries.md b/test/data/doc/group_with_metadata_summaries.md new file mode 100644 index 00000000..84d0b6dd --- /dev/null +++ b/test/data/doc/group_with_metadata_summaries.md @@ -0,0 +1,6 @@ +[#/body] [GroupItem:unspecified] This document talks about various topics. + [#/groups/0] [GroupItem:chapter] This chapter discusses foo and bar. + [#/groups/1] [GroupItem:section] This section talks about foo. + [#/texts/1] [TextItem:text] This paragraph provides more details about foo. + [#/groups/2] [ListGroup:list] Here some foo specifics are listed. + [#/groups/3] [GroupItem:section] This section talks about bar. 
\ No newline at end of file diff --git a/test/test_metadata.py b/test/test_metadata.py index 86a2be28..ddea9157 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -1,16 +1,26 @@ from pathlib import Path +from typing import Any, Optional import pytest from pydantic import BaseModel - +from typing_extensions import override + +from docling_core.transforms.serializer.base import SerializationResult +from docling_core.transforms.serializer.common import create_ser_result +from docling_core.transforms.serializer.markdown import ( + MarkdownDocSerializer, + MarkdownMetaSerializer, + MarkdownParams, +) from docling_core.types.doc.document import ( BaseMeta, + DocItem, DoclingDocument, NodeItem, RefItem, SummaryMetaField, ) -from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc.labels import DocItemLabel, GroupLabel from .test_data_gen_flag import GEN_TEST_DATA @@ -64,16 +74,43 @@ def test_namespace_absence_raises(): def _create_doc_with_group_with_metadata() -> DoclingDocument: doc = DoclingDocument(name="") - grp = doc.add_group() - grp.meta = BaseMeta( - summary=SummaryMetaField(text="This part talks about foo and bar.") + doc.body.meta = BaseMeta( + summary=SummaryMetaField(text="This document talks about various topics.") + ) + grp1 = doc.add_group(name="1", label=GroupLabel.CHAPTER) + grp1.meta = BaseMeta( + summary=SummaryMetaField(text="This chapter discusses foo and bar.") + ) + doc.add_text( + text="This is some introductory text.", label=DocItemLabel.TEXT, parent=grp1 + ) + + grp1a = doc.add_group(parent=grp1, name="1a", label=GroupLabel.SECTION) + grp1a.meta = BaseMeta( + summary=SummaryMetaField(text="This section talks about foo.") + ) + grp1.meta.set_custom_field(namespace="my_corp", name="test", value="value") + txt1 = doc.add_text(text="Regarding foo...", label=DocItemLabel.TEXT, parent=grp1a) + txt1.meta = BaseMeta( + summary=SummaryMetaField(text="This paragraph provides more details about foo.") ) - doc.add_text(text="Foo", label=DocItemLabel.TEXT) - doc.add_text(text="Bar", label=DocItemLabel.TEXT) + lst1a = doc.add_list_group(parent=grp1a) + lst1a.meta = BaseMeta( + summary=SummaryMetaField(text="Here some foo specifics are listed.") + ) + doc.add_list_item(text="lorem", parent=lst1a, enumerated=True) + doc.add_list_item(text="ipsum", parent=lst1a, enumerated=True) + + grp1b = doc.add_group(parent=grp1, name="1b", label=GroupLabel.SECTION) + grp1b.meta = BaseMeta( + summary=SummaryMetaField(text="This section talks about bar.") + ) + doc.add_text(text="Regarding bar...", label=DocItemLabel.TEXT, parent=grp1b) + return doc -def test_group_with_metadata(): +def test_ser_deser(): doc = _create_doc_with_group_with_metadata() # test dumping to and loading from YAML @@ -84,38 +121,133 @@ def test_group_with_metadata(): expected = DoclingDocument.load_from_yaml(filename=exp_file) assert doc == expected + +def test_md_ser_default(): + doc = _create_doc_with_group_with_metadata() + # test exporting to Markdown - exp_file = exp_file.with_suffix(".md") + params = MarkdownParams( + include_annotations=False, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_default.md") if GEN_TEST_DATA: - doc.save_as_markdown(filename=exp_file) + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) else: - actual = doc.export_to_markdown() with open(exp_file, "r", encoding="utf-8") as f: expected = f.read() assert actual == expected 
-def test_legacy_annotations(): - inp = Path("test/data/doc/dummy_doc.yaml") - doc = DoclingDocument.load_from_yaml(filename=inp) - exp_file = inp.parent / f"{inp.stem}_legacy_annotations.md" +def test_md_ser_marked(): + doc = _create_doc_with_group_with_metadata() + + # test exporting to Markdown + params = MarkdownParams( + include_annotations=False, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_marked.md") if GEN_TEST_DATA: - doc.save_as_markdown(filename=exp_file, use_legacy_annotations=True) + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) else: - actual = doc.export_to_markdown(use_legacy_annotations=True) with open(exp_file, "r", encoding="utf-8") as f: expected = f.read() assert actual == expected -def test_mark_meta(): - inp = Path("test/data/doc/dummy_doc.yaml") - doc = DoclingDocument.load_from_yaml(filename=inp) - exp_file = inp.parent / f"{inp.stem}_mark_meta.md" +def test_ser_custom_meta_serializer(): + + class SummaryMarkdownMetaSerializer(MarkdownMetaSerializer): + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + level: Optional[int] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = MarkdownParams(**kwargs) + return create_ser_result( + text="\n\n".join( + [ + f"{' ' * (level or 0)}[{item.self_ref}] [{item.__class__.__name__}:{item.label.value}] {tmp}" # type:ignore[attr-defined] + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) + if ( + tmp := self._serialize_meta_field( + item.meta, key, params.mark_meta + ) + ) + ] + if item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + ) + + def _serialize_meta_field( + self, meta: BaseMeta, name: str, mark_meta: bool + ) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None and isinstance( + field_val, SummaryMetaField + ): + txt = field_val.text + return ( + f"[{self._humanize_text(name, title=True)}] {txt}" + if mark_meta + else txt + ) + else: + return None + + class SummaryMarkdownDocSerializer(MarkdownDocSerializer): + # just for overriding the delimiter to single newline: + @override + def serialize_doc( + self, + *, + parts: list[SerializationResult], + **kwargs: Any, + ) -> SerializationResult: + """Serialize a document out of its parts.""" + text_res = "\n".join([p.text for p in parts if p.text]) + if self.requires_page_break(): + page_sep = self.params.page_break_placeholder or "" + for full_match, _, _ in self._get_page_breaks(text=text_res): + text_res = text_res.replace(full_match, page_sep) + + return create_ser_result(text=text_res, span_source=parts) + + doc = _create_doc_with_group_with_metadata() + + # test exporting to Markdown + params = MarkdownParams( + include_annotations=False, + include_non_meta=False, + ) + ser = SummaryMarkdownDocSerializer( + doc=doc, params=params, meta_serializer=SummaryMarkdownMetaSerializer() + ) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_summaries.md") if GEN_TEST_DATA: - doc.save_as_markdown(filename=exp_file, mark_meta=True) + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) else: - actual = doc.export_to_markdown(mark_meta=True) with open(exp_file, "r", encoding="utf-8") as f: expected = f.read() assert actual == expected From a8af63f359ebf150d1f4ac5d232a3b0a5c42699d Mon 
Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 17:16:33 +0100 Subject: [PATCH 10/22] simplify customization Signed-off-by: Panos Vagenas --- .../data/doc/group_with_metadata_summaries.md | 5 +++ test/test_metadata.py | 36 +++++++++---------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/test/data/doc/group_with_metadata_summaries.md b/test/data/doc/group_with_metadata_summaries.md index 84d0b6dd..45b927ec 100644 --- a/test/data/doc/group_with_metadata_summaries.md +++ b/test/data/doc/group_with_metadata_summaries.md @@ -1,6 +1,11 @@ [#/body] [GroupItem:unspecified] This document talks about various topics. + [#/groups/0] [GroupItem:chapter] This chapter discusses foo and bar. + [#/groups/1] [GroupItem:section] This section talks about foo. + [#/texts/1] [TextItem:text] This paragraph provides more details about foo. + [#/groups/2] [ListGroup:list] Here some foo specifics are listed. + [#/groups/3] [GroupItem:section] This section talks about bar. \ No newline at end of file diff --git a/test/test_metadata.py b/test/test_metadata.py index ddea9157..448ca327 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -213,23 +213,23 @@ def _serialize_meta_field( else: return None - class SummaryMarkdownDocSerializer(MarkdownDocSerializer): - # just for overriding the delimiter to single newline: - @override - def serialize_doc( - self, - *, - parts: list[SerializationResult], - **kwargs: Any, - ) -> SerializationResult: - """Serialize a document out of its parts.""" - text_res = "\n".join([p.text for p in parts if p.text]) - if self.requires_page_break(): - page_sep = self.params.page_break_placeholder or "" - for full_match, _, _ in self._get_page_breaks(text=text_res): - text_res = text_res.replace(full_match, page_sep) - - return create_ser_result(text=text_res, span_source=parts) + # class SummaryMarkdownDocSerializer(MarkdownDocSerializer): + # # just for overriding the delimiter to single newline: + # @override + # def serialize_doc( + # self, + # *, + # parts: list[SerializationResult], + # **kwargs: Any, + # ) -> SerializationResult: + # """Serialize a document out of its parts.""" + # text_res = "\n".join([p.text for p in parts if p.text]) + # if self.requires_page_break(): + # page_sep = self.params.page_break_placeholder or "" + # for full_match, _, _ in self._get_page_breaks(text=text_res): + # text_res = text_res.replace(full_match, page_sep) + + # return create_ser_result(text=text_res, span_source=parts) doc = _create_doc_with_group_with_metadata() @@ -238,7 +238,7 @@ def serialize_doc( include_annotations=False, include_non_meta=False, ) - ser = SummaryMarkdownDocSerializer( + ser = MarkdownDocSerializer( doc=doc, params=params, meta_serializer=SummaryMarkdownMetaSerializer() ) ser_res = ser.serialize() From 18b91443bb1e98ff93f4226075fd147a33956bb8 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 17:40:01 +0100 Subject: [PATCH 11/22] fix reference exclusion Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 4 +- test/data/doc/2206.01062.yaml.pages.dt | 108 --- test/data/doc/2408.09869v3_enriched.gt.md | 348 -------- ...iched_p1_include_annotations_false.gt.html | 162 ---- ...nriched_p1_include_annotations_false.gt.md | 332 ------- ...riched_p1_include_annotations_true.gt.html | 185 ---- ...3_enriched_p1_mark_annotations_false.gt.md | 368 -------- ...v3_enriched_p1_mark_annotations_true.gt.md | 368 -------- test/data/doc/activities.gt.md | 2 - test/data/doc/activities_p1.gt.html | 7 - 
test/data/doc/activities_p2.gt.html | 23 - test/data/doc/activities_p2.gt.md | 30 - test/data/doc/activities_pb_empty.gt.md | 2 - test/data/doc/activities_pb_non_empty.gt.md | 2 - test/data/doc/activities_pb_none.gt.md | 2 - test/data/doc/checkboxes.gt.md | 6 - test/data/doc/cross_page_lists_chunks.json | 845 +----------------- 17 files changed, 37 insertions(+), 2757 deletions(-) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index b08ee88c..288c46c3 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -81,7 +81,7 @@ def _iterate_items( traverse_pictures: bool = False, add_page_breaks: bool = False, visited: Optional[set[str]] = None, -): +) -> Iterable[Tuple[NodeItem, int]]: my_visited: set[str] = visited if visited is not None else set() prev_page_nr: Optional[int] = None page_break_i = 0 @@ -262,7 +262,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: if refs is None: refs = { item.self_ref - for ix, item in enumerate( + for ix, (item, _) in enumerate( _iterate_items( doc=self.doc, traverse_pictures=True, diff --git a/test/data/doc/2206.01062.yaml.pages.dt b/test/data/doc/2206.01062.yaml.pages.dt index 95b52cc7..44f29e05 100644 --- a/test/data/doc/2206.01062.yaml.pages.dt +++ b/test/data/doc/2206.01062.yaml.pages.dt @@ -43,118 +43,10 @@ DocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of labeled, rectangular boundingboxes. We define 11 distinct labels for layout features, namely Caption , Footnote , Formula List-item , , Page-footer , Page-header , Picture , Section-header , Table , Text , and Title . Our reasoning for picking this particular label set is detailed in Section 4. In addition to open intellectual property constraints for the source documents, we required that the documents in DocLayNet adhere to a few conditions. Firstly, we kept scanned documents -DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -KDD '22, August 14-18, 2022, Washington, DC, USA -Figure 2: Distribution of DocLayNet pages across document categories. -to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing 'text in the wild". -The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals Scientific Articles , , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. 
In Figure 2, we show the document categories contained in DocLayNet with their respective sizes. -We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features. -To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions. -2 e.g. AAPL from https://www.annualreports.com/ -Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5. -In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames. -Despite being cost-intense and far less scalable than automation, human annotation has several benefits over automated groundtruth generation. The first and most obvious reason to leverage human annotations is the freedom to annotate any type of document without requiring a programmatic source. For most PDF documents, the original source document is not available. The latter is not a hard constraint with human annotation, but it is for automated methods. A second reason to use human annotations is that the latter usually provide a more natural interpretation of the page layout. The human-interpreted layout can significantly deviate from the programmatic layout used in typesetting. For example, 'invisible' tables might be used solely for aligning text paragraphs on columns. Such typesetting tricks might be interpreted by automated methods incorrectly as an actual table, while the human annotation will interpret it correctly as Text or other styles. The same applies to multi-line text elements, when authors decided to space them as 'invisible' list elements without bullet symbols. A third reason to gather ground-truth through human annotation is to estimate a 'natural' upper bound on the segmentation accuracy. As we will show in Section 4, certain documents featuring complex layouts can have different but equally acceptable layout interpretations. 
This natural upper bound for segmentation accuracy can be found by annotating the same pages multiple times by different people and evaluating the inter-annotator agreement. Such a baseline consistency evaluation is very useful to define expectations for a good target accuracy in trained deep neural network models and avoid overfitting (see Table 1). On the flip side, achieving high annotation consistency proved to be a key challenge in human annotation, as we outline in Section 4. -4 ANNOTATION CAMPAIGN -The annotation campaign was carried out in four phases. In phase one, we identified and prepared the data sources for annotation. In phase two, we determined the class labels and how annotations should be done on the documents in order to obtain maximum consistency. The latter was guided by a detailed requirement analysis and exhaustive experiments. In phase three, we trained the annotation staff and performed exams for quality assurance. In phase four, -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar -% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)class labelCountTrainTestValAllFinManSciLawPatTenCaption225242.041.772.3284-8940-6186-9294-9995-9969-78n/aFootnote63180.600.310.5883-91n/a10062-8885-94n/a82-97Formula250272.251.902.9683-85n/an/a84-8786-96n/an/aList-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. -Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right. -we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised. -Phase 1: Data selection and preparation. Our inclusion criteria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources include publication repositories such as arXiv 3 , government offices, company websites as well as data directory services for financial reports and patents. Scanned documents were excluded wherever possible because they can be rotated or skewed. This would not allow us to perform annotation with rectangular bounding-boxes and therefore complicate the annotation process. 
-Preparation work included uploading and parsing the sourced PDF documents in the Corpus Conversion Service (CCS) [22], a cloud-native platform which provides a visual annotation interface and allows for dataset inspection and analysis. The annotation interface of CCS is shown in Figure 3. The desired balance of pages between the different document categories was achieved by selective subsampling of pages with certain desired properties. For example, we made sure to include the title page of each document and bias the remaining page selection to those with figures or tables. The latter was achieved by leveraging pre-trained object detection models from PubLayNet, which helped us estimate how many figures and tables a given page contains. -Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula List-item , , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on -3 https://arxiv.org/ -DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -KDD '22, August 14-18, 2022, Washington, DC, USA -the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category. -At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages. -Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are: -(1) Every list-item is an individual object instance with class label List-item . 
This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. -(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. -(3) For every Caption , there must be exactly one corresponding Picture or Table . -(4) Connected sub-pictures are grouped together in one Picture object. -(5) Formula numbers are included in a Formula object. -(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. - -The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference. -Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations - -05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0 -Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous. -were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar. -Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar -Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. 
The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. -humanMRCNNFRCNNYOLOR50R101R101v5x6Caption84-8968.471.570.177.7Footnote83-9170.971.873.777.2Formula83-8560.163.463.566.2List-item87-8881.280.881.086.2Page-footer93-9461.659.358.961.1Page-header85-8971.970.072.067.9Picture69-7171.772.772.077.1Section-header83-8467.669.368.474.6Table77-8182.282.982.286.3Text84-8684.685.885.488.1Title60-7276.780.479.982.7All82-8372.473.573.476.8 -to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. -5 EXPERIMENTS -The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this -Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. -paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. -In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. 
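As a point of reference, the mAP@0.5-0.95 protocol described above can be sketched with the COCO evaluation API (pycocotools). This is only an illustrative sketch: the file names are placeholders and are not part of any dataset release.

```python
# Hedged sketch: computing mAP@0.5-0.95 for bounding-box detections
# with the COCO API (pycocotools), as referenced in the text above.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("doclaynet_test_gt.json")              # ground-truth annotations (placeholder path)
coco_dt = coco_gt.loadRes("model_predictions.json")   # detections in COCO results format

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()  # stats[0] is AP averaged over IoU thresholds 0.50:0.95
print(f"mAP@0.5-0.95: {evaluator.stats[0]:.3f}")
```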
-Baselines for Object Detection -In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. -DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -KDD '22, August 14-18, 2022, Washington, DC, USA -Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained on DocLayNet with different class label sets. The reduced label sets were obtained by either down-mapping or dropping labels. -Class-count11654Caption68TextTextTextFootnote71TextTextTextFormula60TextTextTextList-item81Text82TextPage-footer6262--Page-header7268--Picture72727272Section-header68676968Table82838282Text85848484Title77Sec.-h.Sec.-h.Sec.-h.Overall72737877 -Learning Curve -One of the fundamental questions related to any dataset is if it is 'large enough'. To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles. -Impact of Class Labels -The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. 
In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of -Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in /tildelow 10% point improvement. -Class-count115SplitDocPageDocPageCaption6883Footnote7184Formula6066List-item81888288Page-footer6289Page-header7290Picture72827282Section-header68836983Table82898290Text85918490Title7781All72847887 -lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded. -Impact of Document Split in Train and Test Set -Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 0% in mAP over the document-wise splitting. 1 Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided. -Dataset Comparison -Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture , -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar -Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets. -Testing onTraining onlabelsPLNDBDLNPubLayNet (PLN)Figure964323Sec-header87-32Table952449Text96-42total933430DocBank (DB)Figure777131Table196522total486827DocLayNet (DLN)Figure675172Sec-header53-68Table874382Text77-84total594778 -Section-header , Table and Text . Before training, we either mapped or excluded DocLayNet's other labels as specified in table 3, and also PubLayNet's List to Text . 
Note that the different clustering of lists (by list-element vs. whole list objects) naturally decreases the mAP score for Text . -For comparison of DocBank with DocLayNet, we trained only on Picture and Table clusters of each dataset. We had to exclude Text because successive paragraphs are often grouped together into a single object in DocBank. This paragraph grouping is incompatible with the individual paragraphs of DocLayNet. As can be seen in Table 5, DocLayNet trained models yield better performance compared to the previous datasets. It is noteworthy that the models trained on PubLayNet and DocBank perform very well on their own test set, but have a much lower performance on the foreign datasets. While this also applies to DocLayNet, the difference is far less pronounced. Thus we conclude that DocLayNet trained models are overall more robust and will produce better results for challenging, unseen layouts. -Example Predictions -To conclude this section, we illustrate the quality of layout predictions one can expect from DocLayNet-trained models by providing a selection of examples without any further post-processing applied. Figure 6 shows selected layout predictions on pages from the test-set of DocLayNet. Results look decent in general across document categories, however one can also observe mistakes such as overlapping clusters of different classes, or entirely missing boxes due to low confidence. -6 CONCLUSION -In this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets, DocLayNet was created by human annotation in order to obtain reliable layout ground-truth on a wide variety of publication- and typesettingstyles. Including a large proportion of documents outside the scientific publishing domain adds significant value in this respect. -From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust. -To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap. -REFERENCES -[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. -[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. -[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. -[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. 
In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. -[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. -[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. -[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. -[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. -[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. -[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. -[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. -[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. -[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu - -DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -KDD '22, August 14-18, 2022, Washington, DC, USA -Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title -Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes. -Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021. -[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. -[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. -[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. 
CoRR , abs/1911.09070, 2019. -[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. -[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. -[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021. -[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery. -[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021. -[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018. -[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019. - diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index f916b9c1..235d36c3 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -1,64 +1,7 @@ -# Docling Technical Report - -In this image we can see a cartoon image of a duck holding a paper. - In this image we can see a cartoon image of a duck holding a paper. - - -Version 1.0 - -Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar - -AI4K Group, IBM Research R¨ uschlikon, Switzerland - -## Abstract - -This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. - -## 1 Introduction - -Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. 
In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. - -With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. - -Here is what Docling delivers today: - -- Converts PDF documents to JSON or Markdown format, stable and lightning fast -- Understands detailed page layout, reading order, locates figures and recovers table structures -- Extracts metadata from the document, such as title, authors, references and language -- Optionally applies OCR, e.g. for scanned PDFs -- Can be configured to be optimal for batch-mode (i.e. high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) -- Can leverage different accelerators (GPU, MPS, etc.). - -## 2 Getting Started - -To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. - -Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Docling code repository. - -``` -from docling.document_converter import DocumentConverter -``` - -``` -source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL -converter = DocumentConverter() -result = converter.convert_single(source) -print(result.render_as_markdown()) # output: "## DocLayNet: A Human-Annotated Dataset for Document-Layout Analysis [...]" -``` - -Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. - -## 3 Processing pipeline - -Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures.
Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. - -## 3.1 PDF backends - -Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive - -1 see huggingface.co/ds4sd/docling-models/ - In this image, we can see some text and images. @@ -117,33 +60,8 @@ Establishing GPU acceleration support for the AI models is currently work-in-pro -torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. - {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} -Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. - -| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | -|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| -| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | -| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | -| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | - -## 5 Applications - -Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. - -## 6 Future work and contributions - -Docling is designed to allow easy extension of the model library and pipelines. 
In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. - -We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. - -## References - -- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. -- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster - machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . @@ -165,146 +83,18 @@ machine learning through dynamic python bytecode transformation and graph compil -## Appendix - -In this section, we illustrate a few examples of Docling's output in Markdown and JSON. - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. 
For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. 
- -## CCS CONCEPTS - -Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). - -KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACM Reference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - In this image there is a table with some text on it. -In this image there is a table with some text on it. - - - -In this image we can see a text. - In this image we can see a text. - - -AGL Energy Limited ABN 74 1 - -5 061 375 - -In this image I can see the cover of the book. - In this image I can see the cover of the book. - - In this image there is a paper with some text on it. -In this image there is a paper with some text on it. - - - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACMReference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - -1 INTRODUCTION - -Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). 
- -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. - -| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | -|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| -| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | - -to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. - -## 5 EXPERIMENTS - -The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this - In this image, we can see a table with some text. -In this image, we can see a table with some text. 
- - - -Third, achienec - -## EXPERIMENTS - -chalenongayouls ground-vuth dawa such WC - -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -315,156 +105,18 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - - -Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. - -paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. - -In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. - -## Baselines for Object Detection - -In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. 
This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. - -coioct dcochon modols - -## Baselines for Object Detection - -mak enbrel - -Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. - -KDD '22, August 14-18, 2022, Washington, DC, USA - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % - -between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. - -of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric - The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - - - -In this image, there is a table with two columns. 
The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - - -| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | -|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - In this image I can see a blue circle. -In this image I can see a blue circle. - - - -include publication repositories such as arXiv - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- - -annotated pages, from which we obtain accuracy ranges. - -A table with different columns and rows. - A table with different columns and rows. 
- - -| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | -|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - -3 - -, - -government offices, - -We reviewed the col- - -, - -Page- - -Title and - -. - -page. Specificity ensures that the choice of label is not ambiguous, - -In this image there is a text in the middle. - In this image there is a text in the middle. - - - -we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific - -only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can - -quality controls. Phase one and two required a small team of experts to a document category, such as - -Abstract in the - -Scientific Articles were assembled and supervised. - -category. We also avoided class labels that are tightly linked to the - -Phase 1: Data selection and preparation. - -Our inclusion cri- - -Author - -Affiliation - -teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). - -semantics of the text. Labels such as and - -, - -as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html index 96c6750b..3e166869 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html @@ -12,168 +12,6 @@

Abstract

1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

With Docling, we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained Python library with a permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

-

Here is what Docling delivers today:

-
    -
  • Converts PDF documents to JSON or Markdown format, stable and lightning fast
  • -
  • Understands detailed page layout, reading order, locates figures and recovers table structures
  • -
  • Extracts metadata from the document, such as title, authors, references and language
  • -
  • Optionally applies OCR, e.g. for scanned PDFs
  • -
  • Can be configured to be optimal for batch-mode (i.e. high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)
  • -
  • Can leverage different accelerators (GPU, MPS, etc).
  • -
-

2 Getting Started

-To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. -

Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Docling code repository.

-
from docling.document_converter import DocumentConverter
-
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]"
-

Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.
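For illustration, the sketch below turns OCR off and keeps table structure recognition on. It assumes the PdfPipelineOptions / PdfFormatOption interface of more recent Docling releases, so the exact option and method names may differ from the version described in this report; treat it as a sketch rather than the canonical API.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Assumed option names: disable OCR, keep table structure recognition enabled.
pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=True)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("https://arxiv.org/pdf/2206.01062")
print(result.document.export_to_markdown())
```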

-

3 Processing pipeline

-

Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.
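To make the flow concrete, the pseudo-Python sketch below mirrors the stages described above; all names are illustrative placeholders, not part of the Docling API.

```python
# Hypothetical sketch of the linear conversion pipeline described above.
def convert_document(pdf_path, backend, page_models, postprocess):
    pages = backend.parse(pdf_path)       # programmatic text tokens + page bitmaps
    for model in page_models:             # e.g. layout analysis, table structure
        pages = model(pages)              # each model augments the page objects
    return postprocess(pages)             # language detection, reading order, assembly
```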

-

3.1 PDF backends

-

Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive

-

1 see huggingface.co/ds4sd/docling-models/

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
-

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

-

We therefore decided to provide multiple backend choices, and additionally open-source a custom-built PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium, which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

-

3.2 AI models

-

As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.

-

Layout Analysis Model

-

Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].

-

The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.

-

Table Structure Recognition

-

The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].

-

The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription of the text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the number of included table cells.

-

OCR

-

Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmap images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular third-party OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slowly on CPU (upwards of 30 seconds per page).

-

We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.

-

3.3 Assembly

-

In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core. The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the user's request.

-

3.4 Extensibility

-

Docling provides a straightforward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part of the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract base class (BaseModelPipeline) or cloning the default model pipeline. This effectively allows one to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.

-

Implementations of model classes must satisfy the Python Callable interface. The __call__ method must accept an iterator over page objects and produce another iterator over the same page objects, augmented with the additional features predicted by the model by extending the provided PagePredictions data model accordingly.
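As a rough illustration, a custom page-level model could look like the following sketch; the figure_classes attribute and the placeholder classifier are hypothetical stand-ins, and only the Callable contract itself is taken from the description above.

```python
from typing import Iterable, Iterator

class FigureClassifierModel:
    """Hypothetical page-level model satisfying the Callable contract."""

    def __call__(self, pages: Iterable) -> Iterator:
        for page in pages:
            # Attach a new prediction to the page, analogous to extending the
            # PagePredictions data model (the attribute name is illustrative).
            page.predictions.figure_classes = self._classify(page)
            yield page

    def _classify(self, page) -> list:
        # Placeholder: a real model would run inference on the page image here.
        return []
```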

-

4 Performance

-

In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (by setting the OMP_NUM_THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to the full core count on the test hardware). All results are shown in Table 1.

-

If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.
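As a sketch of both knobs, the thread budget is constrained via the OMP_NUM_THREADS environment variable and the backend is swapped when constructing the converter; the import path and class name for the pypdfium backend follow more recent Docling releases and are assumptions rather than the exact interface benchmarked here.

```python
import os

# Constrain the CPU thread budget; this must be set before the ML runtimes load.
os.environ["OMP_NUM_THREADS"] = "4"

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

# Swap in the lighter pypdfium backend instead of the default docling-parse backend.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(backend=PyPdfiumDocumentBackend)}
)
```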

-

Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and

-

torch runtimes backing the Docling pipeline. We will deliver updates on this topic in a future version of this report.

-
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
| CPU | Thread budget | native backend |  |  | pypdfium backend |  |  |
|  |  | TTS | Pages/s | Mem | TTS | Pages/s | Mem |
|---|---|---|---|---|---|---|---|
| Apple M3 Max (16 cores) | 4 | 177 s | 1.27 | 6.20 GB | 103 s | 2.18 | 2.56 GB |
| Apple M3 Max (16 cores) | 16 | 167 s | 1.34 | 6.20 GB | 92 s | 2.45 | 2.56 GB |
| Intel(R) Xeon E5-2690 | 4 | 375 s | 0.60 | 6.16 GB | 239 s | 0.94 | 2.42 GB |
| Intel(R) Xeon E5-2690 | 16 | 244 s | 0.92 | 6.16 GB | 143 s | 1.57 | 2.42 GB |
-

5 Applications

-

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.
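As a minimal sketch of the document-native chunking idea, the HierarchicalChunker shipped with docling-core (rather than quackling itself) can be applied to a previously converted document; the JSON file name below is a placeholder.

```python
from pathlib import Path

from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types.doc import DoclingDocument

# Load a document previously converted and serialized to JSON (placeholder path).
doc = DoclingDocument.model_validate_json(Path("converted_doc.json").read_text())

chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
    # Each chunk carries text plus structural context suitable for vector embedding in RAG.
    print(chunk.text)
```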

-

6 Future work and contributions

-

Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equation-recognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration, as well as improving the Docling-native PDF backend, is also on our roadmap.

-

We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.

-

References

-
    -
  1. J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.
  2. -
  3. J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster
  4. -
-

machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .

-
    -
  1. C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.
  2. -
  3. J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .
  4. -
  5. O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.
  6. -
  7. IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .
  8. -
  9. A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .
  10. -
  11. J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index .
  12. -
  13. M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8_3 .
  14. -
  15. L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .
  16. -
  17. L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .
  18. -
  19. A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.
  20. -
  21. B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.
  22. -
  23. pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .
  24. -
  25. P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .
  26. -
  27. Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.
  28. -
-

Appendix

-

In this section, we illustrate a few examples of Docling's output in Markdown and JSON.

-

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

-

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

-

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

-

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

-

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

-

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

-

ABSTRACT

-

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

-

CCS CONCEPTS

-

· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ;

-

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

-

Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

-

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

-

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

-

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

-

ABSTRACT

-

Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

-

CCS CONCEPTS

-

Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ;

-

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-

KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043

-

Figure 1: Four examples of complex page layouts across different document categories

-

KEYWORDS

-

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

-

ACM Reference Format:

-

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

-

AGL Energy Limited ABN 74 1

-

5 061 375

-

Figure 1: Four examples of complex page layouts across different document categories

-

KEYWORDS

-

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

-

ACMReference Format:

-

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

-

1 INTRODUCTION

-

Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).

-

KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

-

Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.

-
humanMRCNN R50 R101FRCNN R101YOLO v5x6
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-8668.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.570.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.477.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8
-

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.

-

5 EXPERIMENTS

-

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

-

Third, achienec

-

EXPERIMENTS

-

chalenongayouls ground-vuth dawa such WC

-

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

-

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

-

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].

-

Baselines for Object Detection

-

In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.

-

coioct dcochon modols

-

Baselines for Object Detection

-

mak enbrel

-

Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.

-

KDD '22, August 14-18, 2022, Washington, DC, USA

-

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar

-

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %

-

between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

-

of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric

-
| class label | Count | % of Total |  |  | triple inter-annotator mAP @ 0.5-0.95 (%) |  |  |  |  |  |  |
|  |  | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
-

include publication repositories such as arXiv

-

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-

-

annotated pages, from which we obtain accuracy ranges.

-
| class label | Count | % of Total |  |  | triple inter-annotator mAP @ 0.5-0.95 (%) |  |  |  |  |  |  |
|  |  | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
-

3

-

,

-

government offices,

-

We reviewed the col-

-

,

-

Page-

-

Title and

-

.

-

page. Specificity ensures that the choice of label is not ambiguous,

-

we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific

-

only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can

-

quality controls. Phase one and two required a small team of experts to a document category, such as

-

Abstract in the

-

Scientific Articles were assembled and supervised.

-

category. We also avoided class labels that are tightly linked to the

-

Phase 1: Data selection and preparation.

-

Our inclusion cri-

-

Author

-

Affiliation

-

teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).

-

semantics of the text. Labels such as and

-

,

-

as seen

diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index 6adef91e..6f2233e8 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -20,91 +20,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -Here is what Docling delivers today: - -- Converts PDF documents to JSON or Markdown format, stable and lightning fast -- Understands detailed page layout, reading order, locates figures and recovers table structures -- Extracts metadata from the document, such as title, authors, references and language -- Optionally applies OCR, e.g. for scanned PDFs -- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) -- Can leverage different accelerators (GPU, MPS, etc). - -## 2 Getting Started - -To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. - -Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. - -``` -from docling.document_converter import DocumentConverter Large -``` - -``` -source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" -``` - -Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. - -## 3 Processing pipeline - -Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. 
Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. - -## 3.1 PDF backends - -Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive - -1 see huggingface.co/ds4sd/docling-models/ - In this image, we can see some text and images. -Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. - - - -licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. - -We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. - -## 3.2 AI models - -As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. - -## Layout Analysis Model - -Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5]. - -The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. - -## Table Structure Recognition - -The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. 
It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. - -The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. - -## OCR - -Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). - -We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. - -## 3.3 Assembly - -In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. - -## 3.4 Extensibility - -Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements. - -Implementations of model classes must satisfy the python Callable interface. The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. 
- -## 4 Performance - -In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. - -If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. - -Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -132,141 +49,16 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster -machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . - -- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. -- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . -- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. -- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . -- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF . -- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . -- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. 
In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . -- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . -- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . -- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. -- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. -- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . -- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . -- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. - -## Appendix - -In this section, we illustrate a few examples of Docling's output in Markdown and JSON. - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. 
For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. 
- -## CCS CONCEPTS - -Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). - -KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACM Reference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - In this image there is a table with some text on it. - - In this image we can see a text. - - -AGL Energy Limited ABN 74 1 - -5 061 375 - In this image I can see the cover of the book. - - In this image there is a paper with some text on it. - - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACMReference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - -1 INTRODUCTION - -Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). - -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. 
The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. - -| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | -|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| -| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | - -to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. - -## 5 EXPERIMENTS - -The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this - In this image, we can see a table with some text. - - -Third, achienec - -## EXPERIMENTS - -chalenongayouls ground-vuth dawa such WC - The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. 
The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -277,140 +69,16 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - - -Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. - -paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. - -In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. - -## Baselines for Object Detection - -In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. - -coioct dcochon modols - -## Baselines for Object Detection - -mak enbrel - -Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. - -KDD '22, August 14-18, 2022, Washington, DC, USA - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as % - -between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. - -of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric - The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - - In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - - -| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | -|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - In 
this image I can see a blue circle. - - -include publication repositories such as arXiv - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- - -annotated pages, from which we obtain accuracy ranges. - A table with different columns and rows. - - -| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | -|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - -3 - -, - -government offices, - -We reviewed the col- - -, - -Page- - -Title and - -. - -page. Specificity ensures that the choice of label is not ambiguous, - In this image there is a text in the middle. - - - -we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific - -only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can - -quality controls. Phase one and two required a small team of experts to a document category, such as - -Abstract in the - -Scientific Articles were assembled and supervised. - -category. We also avoided class labels that are tightly linked to the - -Phase 1: Data selection and preparation. - -Our inclusion cri- - -Author - -Affiliation - -teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. 
The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C). - -semantics of the text. Labels such as and - -, - -as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html index 9dc6891e..0bb79d05 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html @@ -13,191 +13,6 @@

Abstract

1 Introduction

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

-

Here is what Docling delivers today:

-
    -
  • Converts PDF documents to JSON or Markdown format, stable and lightning fast
  • -
  • Understands detailed page layout, reading order, locates figures and recovers table structures
  • -
  • Extracts metadata from the document, such as title, authors, references and language
  • -
  • Optionally applies OCR, e.g. for scanned PDFs
  • -
  • Can be configured to be optimal for batch-mode (i.e. high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)
  • -
  • Can leverage different accelerators (GPU, MPS, etc).
  • -
-

2 Getting Started

-To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. -

Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Docling code repository.

-
from docling.document_converter import DocumentConverter
-
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]"
-

Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.
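As a rough illustration of such configuration, consider the sketch below; the PipelineOptions class, its import path and the do_ocr / do_table_structure fields are assumptions inferred from the feature list above, not a verbatim reference to Docling's documented API.

```
# Hypothetical sketch: tuning pipeline features before conversion.
# Class name, import path and option fields are assumptions, not documented API.
from docling.datamodel.base_models import PipelineOptions  # assumed location
from docling.document_converter import DocumentConverter

pipeline_options = PipelineOptions(
    do_ocr=False,             # skip OCR for born-digital PDFs
    do_table_structure=True,  # keep table structure recognition enabled
)
converter = DocumentConverter(pipeline_options=pipeline_options)
result = converter.convert_single("https://arxiv.org/pdf/2206.01062")
print(result.render_as_markdown())
```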

-

3 Processing pipeline

-

Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.
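Conceptually, this staged design can be pictured as in the following sketch of a linear, per-page pipeline; it is illustrative only and not Docling's actual internal code.

```
# Illustrative sketch of a linear pipeline: parse pages, run a fixed sequence
# of per-page models, then hand the aggregated result to post-processing.
from typing import Callable, Iterable, List

PageModel = Callable[[dict], dict]


def run_pipeline(pages: Iterable[dict], models: List[PageModel]) -> List[dict]:
    processed = []
    for page in pages:
        for model in models:  # e.g. layout analysis, table structure
            page = model(page)
        processed.append(page)
    return processed          # aggregated pages go on to post-processing
```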

-

3.1 PDF backends

-

Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive

-

1 see huggingface.co/ds4sd/docling-models/

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
-

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].
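The two backend requirements stated above can be captured in a small interface; the class and method names in this sketch are hypothetical and only illustrate the shape of such an abstraction, not Docling's actual backend classes.

```
# Hypothetical sketch of a PDF backend interface: (a) text tokens with
# coordinates per page, (b) a rendered bitmap of each page.
from abc import ABC, abstractmethod
from typing import List, Tuple

BBox = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


class PdfBackend(ABC):  # illustrative name only
    @abstractmethod
    def text_cells(self, page_no: int) -> List[Tuple[str, BBox]]:
        """Return (text, bounding box) pairs for all text tokens on the page."""

    @abstractmethod
    def render_page(self, page_no: int, dpi: int = 72) -> bytes:
        """Return a bitmap rendering of the page, e.g. as PNG bytes."""
```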

-

We therefore decided to provide multiple backend choices, and additionally open-source a custom-built PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

-

3.2 AI models

-

As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.

-

Layout Analysis Model

-

Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].

-

The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.
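As a simplified illustration of the overlap-based post-processing (a greedy, NMS-style suppression sketch, not the actual implementation), overlapping proposals can be pruned by confidence using an intersection-over-union test:

```
# Simplified sketch: keep the highest-confidence proposal among heavily
# overlapping layout-box proposals (greedy, NMS-style suppression).
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def iou(a: Box, b: Box) -> float:
    x0, y0 = max(a[0], b[0]), max(a[1], b[1])
    x1, y1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x1 - x0) * max(0.0, y1 - y0)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def suppress(proposals: List[Tuple[Box, float]], thresh: float = 0.5) -> List[Tuple[Box, float]]:
    kept: List[Tuple[Box, float]] = []
    for box, conf in sorted(proposals, key=lambda p: p[1], reverse=True):
        if all(iou(box, k_box) < thresh for k_box, _ in kept):
            kept.append((box, conf))
    return kept
```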

-

Table Structure Recognition

-

The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].

-

The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription of the text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the number of included table cells.
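A rough sketch of the matching-back idea follows (illustrative only; a real implementation also handles cell spans, tolerances and ordering): each predicted table-cell box collects the PDF text tokens whose centre points fall inside it.

```
# Rough sketch: assign parsed PDF text tokens to predicted table-cell boxes by
# centre-point containment (illustrative, not Docling's actual matching code).
from typing import Dict, List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)


def match_cells(cell_boxes: List[Box], pdf_tokens: List[Tuple[str, Box]]) -> Dict[int, List[str]]:
    matches: Dict[int, List[str]] = {i: [] for i in range(len(cell_boxes))}
    for text, (x0, y0, x1, y1) in pdf_tokens:
        cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
        for i, (bx0, by0, bx1, by1) in enumerate(cell_boxes):
            if bx0 <= cx <= bx1 and by0 <= cy <= by1:
                matches[i].append(text)
                break
    return matches
```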

-

OCR

-

Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmap images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular third-party OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slowly on CPU (upwards of 30 seconds per page).
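For reference, this is roughly how the third-party EasyOCR library is driven on a pre-rendered page image; it illustrates the library itself rather than Docling's internal wiring, and assumes a page.png produced by the 216 dpi rendering step.

```
# Standalone EasyOCR usage on a pre-rendered page image (e.g. at 216 dpi).
import easyocr

reader = easyocr.Reader(["en"])        # loads detection + recognition models
results = reader.readtext("page.png")  # list of (bbox, text, confidence)
for bbox, text, confidence in results:
    print(f"{confidence:.2f}  {text}")
```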

-

We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.

-

3.3 Assembly

-

In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the user's request.

-

3.4 Extensibility

-

Docling provides a straightforward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract base class ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows one to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.

-

Implementations of model classes must satisfy the python Callable interface. The __call__ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.
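A minimal sketch of a conforming model is given below, using a simplified stand-in page type; the real page objects and the PagePredictions data model are richer than shown here.

```
# Minimal sketch of a model satisfying the Callable contract described above:
# it consumes an iterator of page objects and yields them back, augmented.
from dataclasses import dataclass, field
from typing import Dict, Iterable, Iterator


@dataclass
class Page:  # simplified stand-in for Docling's page / PagePredictions models
    number: int
    predictions: Dict[str, object] = field(default_factory=dict)


class DummyLanguageModel:
    """Tags every page with a (placeholder) detected language."""

    def __call__(self, pages: Iterable[Page]) -> Iterator[Page]:
        for page in pages:
            page.predictions["language"] = "en"  # placeholder prediction
            yield page


for page in DummyLanguageModel()([Page(number=i) for i in range(3)]):
    print(page.number, page.predictions)
```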

-

4 Performance

-

In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting the OMP_NUM_THREADS environment variable) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.
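To reproduce such a setup, the thread budget can be pinned through the standard OpenMP environment variable before the conversion stack is imported, for example:

```
# Pin the OpenMP thread budget before importing the conversion stack.
import os

os.environ["OMP_NUM_THREADS"] = "4"  # matches the Docling default used above
```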

-

If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.

-

Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and

-

torch runtimes backing the Docling pipeline. We will deliver updates on this topic in a future version of this report.

-
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |
|-----|---------------|-------|---------|---------|-------|---------|---------|
|     |               | TTS   | Pages/s | Mem     | TTS   | Pages/s | Mem     |
| Apple M3 Max (16 cores) | 4  | 177 s | 1.27 | 6.20 GB | 103 s | 2.18 | 2.56 GB |
| Apple M3 Max (16 cores) | 16 | 167 s | 1.34 | 6.20 GB | 92 s  | 2.45 | 2.56 GB |
| Intel(R) Xeon E5-2690   | 4  | 375 s | 0.60 | 6.16 GB | 239 s | 0.94 | 2.42 GB |
| Intel(R) Xeon E5-2690   | 16 | 244 s | 0.92 | 6.16 GB | 143 s | 1.57 | 2.42 GB |
-

5 Applications

-

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

-

6 Future work and contributions

-

Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equation-recognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.

-

We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.

-

References

[1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/JaidedAI/EasyOCR, 2024. Version: 1.7.0.

[2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24). ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf.

[3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD), pages 363-373. IEEE, 2022.

[4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https://github.com/qpdf/qpdf.

[5] O. R. developers. Onnx runtime. https://onnxruntime.ai/, 2024. Version: 1.18.1.

[6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/data-prep-kit.

[7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF.

[8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama_index.

[9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San José, CA, USA, August 21-26, 2023, Proceedings, Part II, pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10.1007/978-3-031-41679-8_3. URL https://doi.org/10.1007/978-3-031-41679-8_3.

[10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024), pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15.

[11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications, 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/s41467-024-50779-y.

[12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, pages 4614-4623, 2022.

[13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large human-annotated dataset for document-layout segmentation. pages 3743-3751, 2022.

[14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/py-pdf/pypdf.

[15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/pypdfium2-team/pypdfium2.

[16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.

Appendix

In this section, we illustrate a few examples of Docling's output in Markdown and JSON.

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis


Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com

Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

ABSTRACT

Accurate document layout analysis is a key requirement for high-quality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNet-trained models are more robust and thus the preferred choice for general-purpose document-layout analysis.

CCS CONCEPTS

· Information systems → Document structure; · Applied computing → Document analysis; · Computing methodologies → Machine learning; Computer vision; Object detection

Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043


Figure 1: Four examples of complex page layouts across different document categories

KEYWORDS

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

ACM Reference Format:

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3534678.3539043

1 INTRODUCTION

Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.

Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors appear first under the title. Text content inside figures is currently dropped; the caption is retained and linked to the figure in the JSON representation (not shown).


Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.

| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |
|---|---|---|---|---|
| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.

5 EXPERIMENTS

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].

Baselines for Object Detection

In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text, Table and Picture. This is not entirely surprising, as Text, Table and Picture are abundant and the most visually distinctive in a document.

Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors appear first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.


Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |

Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple inter-annotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).
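As a rough illustration of what "span information in the fields of each table cell" can look like, the sketch below shows a possible JSON-like record for the spanning 'triple inter-annotator mAP @ 0.5-0.95 (%)' header cell of Table 1. The field names (start_col_offset_idx, col_span, column_header, ...) follow the general shape of docling-core's table cell model but are written here as assumptions for illustration, not as the exact schema used in the report.

```
# Illustrative sketch only: a dict with assumed field names showing how a spanning
# column-header cell could carry its grid position and span in the JSON output.
spanning_header_cell = {
    "text": "triple inter-annotator mAP @ 0.5-0.95 (%)",
    "column_header": True,         # marks the cell as part of the column heading
    "start_row_offset_idx": 0,     # grid row where the cell starts
    "end_row_offset_idx": 1,       # exclusive end row
    "start_col_offset_idx": 5,     # grid column where the cell starts
    "end_col_offset_idx": 12,      # exclusive end column: the cell spans 7 columns
    "row_span": 1,
    "col_span": 7,
}

# A consumer can recover every data point's heading purely from these grid offsets,
# without relying on the repeated header strings of the Markdown rendering.
assert (
    spanning_header_cell["end_col_offset_idx"]
    - spanning_header_cell["start_col_offset_idx"]
    == spanning_header_cell["col_span"]
)
```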

diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index 1b8cda26..8170fe3c 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -22,93 +22,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -Here is what Docling delivers today: - -- Converts PDF documents to JSON or Markdown format, stable and lightning fast -- Understands detailed page layout, reading order, locates figures and recovers table structures -- Extracts metadata from the document, such as title, authors, references and language -- Optionally applies OCR, e.g. for scanned PDFs -- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) -- Can leverage different accelerators (GPU, MPS, etc). - -## 2 Getting Started - -To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. - -Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. - -``` -from docling.document_converter import DocumentConverter Large -``` - -``` -source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" -``` - -Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. - -## 3 Processing pipeline - -Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. 
Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. - -## 3.1 PDF backends - -Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive - -1 see huggingface.co/ds4sd/docling-models/ - -In this image, we can see some text and images. - -Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. - In this image, we can see some text and images. - - -licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. - -We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. - -## 3.2 AI models - -As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. - -## Layout Analysis Model - -Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5]. - -The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. 
- -## Table Structure Recognition - -The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. - -The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. - -## OCR - -Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). - -We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. - -## 3.3 Assembly - -In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. - -## 3.4 Extensibility - -Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements. - -Implementations of model classes must satisfy the python Callable interface. 
The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. - -## 4 Performance - -In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. - -If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. - -Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -139,151 +54,16 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster -machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . - -- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. -- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . -- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. -- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . -- [7] A. S. Inc. PyMuPDF, 2024. 
URL https://github.com/pymupdf/PyMuPDF . -- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . -- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . -- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . -- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . -- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. -- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. -- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . -- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . -- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. - -## Appendix - -In this section, we illustrate a few examples of Docling's output in Markdown and JSON. - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. 
In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). - -KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACM Reference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - -In this image there is a table with some text on it. - In this image there is a table with some text on it. - - -In this image we can see a text. - In this image we can see a text. - - -AGL Energy Limited ABN 74 1 - -5 061 375 - In this image I can see the cover of the book. -In this image I can see the cover of the book. - - - In this image there is a paper with some text on it. -In this image there is a paper with some text on it. - - - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACMReference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - -1 INTRODUCTION - -Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). - -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. - -| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | -|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| -| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | - -to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. - -## 5 EXPERIMENTS - -The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this - In this image, we can see a table with some text. 
-In this image, we can see a table with some text. - - - -Third, achienec - -## EXPERIMENTS - -chalenongayouls ground-vuth dawa such WC - The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -294,164 +74,16 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - - - -Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. - -paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. - -In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. - -## Baselines for Object Detection - -In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. 
This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. - -coioct dcochon modols - -## Baselines for Object Detection - -mak enbrel - -Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. - -KDD '22, August 14-18, 2022, Washington, DC, USA - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % - -between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. - -of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric - -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - - In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." 
The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - - - -| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | -|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - -In this image I can see a blue circle. - In this image I can see a blue circle. - - -include publication repositories such as arXiv - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- - -annotated pages, from which we obtain accuracy ranges. - A table with different columns and rows. -A table with different columns and rows. 
- - - -| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | -|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - -3 - -, - -government offices, - -We reviewed the col- - -, - -Page- - -Title and - -. - -page. Specificity ensures that the choice of label is not ambiguous, - -In this image there is a text in the middle. - In this image there is a text in the middle. - - - -we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific - -only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can - -quality controls. Phase one and two required a small team of experts to a document category, such as - -Abstract in the - -Scientific Articles were assembled and supervised. - -category. We also avoided class labels that are tightly linked to the - -Phase 1: Data selection and preparation. - -Our inclusion cri- - -Author - -Affiliation - -teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). - -semantics of the text. Labels such as and - -, - -as seen diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md index 10c9ce4d..d1d5e8b5 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md @@ -22,93 +22,8 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -Here is what Docling delivers today: - -- Converts PDF documents to JSON or Markdown format, stable and lightning fast -- Understands detailed page layout, reading order, locates figures and recovers table structures -- Extracts metadata from the document, such as title, authors, references and language -- Optionally applies OCR, e.g. for scanned PDFs -- Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution) -- Can leverage different accelerators (GPU, MPS, etc). - -## 2 Getting Started - -To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at [github.com/DS4SD/docling](https://github.com/DS4SD/docling) . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. - -Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. - -``` -from docling.document_converter import DocumentConverter Large -``` - -``` -source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" -``` - -Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. - -## 3 Processing pipeline - -Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. 
Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. - -## 3.1 PDF backends - -Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive - -1 see huggingface.co/ds4sd/docling-models/ - In this image, we can see some text and images. -Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. - -In this image, we can see some text and images. - - - -licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. - -We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. - -## 3.2 AI models - -As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks. - -## Layout Analysis Model - -Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5]. - -The Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables. 
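A minimal sketch of the basic conversion flow, using only the interface names shown in the Getting Started listing above (DocumentConverter, convert_single, render_as_markdown); exact signatures may differ between Docling versions, so treat this as illustrative rather than authoritative.

```
# Minimal end-to-end conversion sketch, assuming the interface named in the
# Getting Started listing above; signatures may vary by Docling version.
from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL

converter = DocumentConverter()
result = converter.convert_single(source)

# Serialize the assembled document to Markdown.
print(result.render_as_markdown())
```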
- -## Table Structure Recognition - -The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2]. - -The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells. - -## OCR - -Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page). - -We are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements. - -## 3.3 Assembly - -In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request. - -## 3.4 Extensibility - -Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements. - -Implementations of model classes must satisfy the python Callable interface. 
The \_\_call\_\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly. - -## 4 Performance - -In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1. - -If you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery. - -Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -139,151 +54,16 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster -machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . - -- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022. -- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf . -- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1. -- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit . -- [7] A. S. Inc. PyMuPDF, 2024. 
URL https://github.com/pymupdf/PyMuPDF . -- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\_index . -- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos´ e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\_3 . -- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 . -- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y . -- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022. -- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022. -- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf . -- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 . -- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023. - -## Appendix - -In this section, we illustrate a few examples of Docling's output in Markdown and JSON. - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. 
In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -· Informationsystems → Documentstructure ; · Appliedcomputing → Document analysis ; · Computing methodologies → Machine learning Computer vision ; ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com - -Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com - -Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com - -Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com - -Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com - -## ABSTRACT - -Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. 
Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis. - -## CCS CONCEPTS - -Æ Information systems → Document structure ; Æ Applied computing → Document analysis ; Æ Computing methodologies → Machine learning ; Computer vision ; Object detection ; - -Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). - -KDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043 - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACM Reference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - In this image there is a table with some text on it. -In this image there is a table with some text on it. - - - In this image we can see a text. -In this image we can see a text. - - - -AGL Energy Limited ABN 74 1 - -5 061 375 - In this image I can see the cover of the book. -In this image I can see the cover of the book. - - - In this image there is a paper with some text on it. -In this image there is a paper with some text on it. - - - -Figure 1: Four examples of complex page layouts across different document categories - -## KEYWORDS - -PDF document conversion, layout segmentation, object-detection, data set, Machine Learning - -## ACMReference Format: - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043 - -1 INTRODUCTION - -Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown). - -KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset. - -| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 | -|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------| -| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 | - -to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity. - -## 5 EXPERIMENTS - -The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this - In this image, we can see a table with some text. 
-In this image, we can see a table with some text. - - - -Third, achienec - -## EXPERIMENTS - -chalenongayouls ground-vuth dawa such WC - The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -294,164 +74,16 @@ The graph has two lines: one for the training program and one for the percentage - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. - **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - - - -Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions. - -paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work. - -In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16]. - -## Baselines for Object Detection - -In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 × 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. 
This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document. - -coioct dcochon modols - -## Baselines for Object Detection - -mak enbrel - -Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table. - -KDD '22, August 14-18, 2022, Washington, DC, USA - -Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % - -between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges. - -of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric - The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - - - In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." 
The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - - - -| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | -|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - In this image I can see a blue circle. -In this image I can see a blue circle. - - - -include publication repositories such as arXiv - -Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple- - -annotated pages, from which we obtain accuracy ranges. - A table with different columns and rows. -A table with different columns and rows. 
- - - -| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | -|-----------------|---------|--------------|--------------|--------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------|----------------------------------------------| -| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | -| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a | -| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 | -| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a | -| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 | -| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 | -| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 | -| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 | -| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 | -| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 | -| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 | -| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 | -| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 | - -3 - -, - -government offices, - -We reviewed the col- - -, - -Page- - -Title and - -. - -page. Specificity ensures that the choice of label is not ambiguous, - In this image there is a text in the middle. - -In this image there is a text in the middle. - - - -we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific - -only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can - -quality controls. Phase one and two required a small team of experts to a document category, such as - -Abstract in the - -Scientific Articles were assembled and supervised. - -category. We also avoided class labels that are tightly linked to the - -Phase 1: Data selection and preparation. - -Our inclusion cri- - -Author - -Affiliation - -teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. 
In the JSON representation, the span information is reflected in the fields of each table cell (C). - -semantics of the text. Labels such as and - -, - -as seen diff --git a/test/data/doc/activities.gt.md b/test/data/doc/activities.gt.md index d03e5d7d..0770bf62 100644 --- a/test/data/doc/activities.gt.md +++ b/test/data/doc/activities.gt.md @@ -6,8 +6,6 @@ Duck Figure 1: This is a cute duckling - - ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_p1.gt.html b/test/data/doc/activities_p1.gt.html index 6b0b9cad..18932e44 100644 --- a/test/data/doc/activities_p1.gt.html +++ b/test/data/doc/activities_p1.gt.html @@ -144,13 +144,6 @@

Let's swim!

Hmm, what else…

  • -Another activity item
  • -
  • -Yet another one
  • -
  • -Stopping it here
  • -
-

Some text.

-
    -
  • -Starting the next page with a list item.
  • -
  • -Second item.
diff --git a/test/data/doc/activities_p2.gt.html b/test/data/doc/activities_p2.gt.html index 6b0b9cad..9ab4b1be 100644 --- a/test/data/doc/activities_p2.gt.html +++ b/test/data/doc/activities_p2.gt.html @@ -124,34 +124,11 @@
-

Summer activities

-

Swimming in the lake

-

Duck

-
Figure 1: This is a cute duckling
-

Let's swim!

-

To get started with swimming, first lay down in a water and try not to drown:

    -
  • ∞ You can relax and look around
  • -
  • ∞ Paddle about
  • -
  • ∞ Enjoy summer warmth
  • -
-

Also, don't forget:

-
    -
  • 1. Wear sunglasses
  • -
  • 2. Don't forget to drink water
  • -
  • 3. Use sun cream
  • -
-

Hmm, what else…

-
    -
  • -Another activity item
  • -Yet another one
  • -Stopping it here

Some text.

-
    -
  • -Starting the next page with a list item.
  • -
  • -Second item.
  • -
diff --git a/test/data/doc/activities_p2.gt.md b/test/data/doc/activities_p2.gt.md index b8910c37..4801d37b 100644 --- a/test/data/doc/activities_p2.gt.md +++ b/test/data/doc/activities_p2.gt.md @@ -1,34 +1,4 @@ -## Summer activities - -## Swimming in the lake - -Duck - -Figure 1: This is a cute duckling - - - -## Let's swim! - -To get started with swimming, first lay down in a water and try not to drown: - -- ∞ You can relax and look around -- ∞ Paddle about -- ∞ Enjoy summer warmth - -Also, don't forget: - -- 1. Wear sunglasses -- 2. Don't forget to drink water -- 3. Use sun cream - -Hmm, what else… - -- -Another activity item - -Yet another one - -Stopping it here Some text. - -- -Starting the next page with a list item. -- -Second item. diff --git a/test/data/doc/activities_pb_empty.gt.md b/test/data/doc/activities_pb_empty.gt.md index 0a1695cd..185578eb 100644 --- a/test/data/doc/activities_pb_empty.gt.md +++ b/test/data/doc/activities_pb_empty.gt.md @@ -6,8 +6,6 @@ Duck Figure 1: This is a cute duckling - - ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_pb_non_empty.gt.md b/test/data/doc/activities_pb_non_empty.gt.md index e3ca76eb..c134cf71 100644 --- a/test/data/doc/activities_pb_non_empty.gt.md +++ b/test/data/doc/activities_pb_non_empty.gt.md @@ -6,8 +6,6 @@ Duck Figure 1: This is a cute duckling - - ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/activities_pb_none.gt.md b/test/data/doc/activities_pb_none.gt.md index b8910c37..1e983a54 100644 --- a/test/data/doc/activities_pb_none.gt.md +++ b/test/data/doc/activities_pb_none.gt.md @@ -6,8 +6,6 @@ Duck Figure 1: This is a cute duckling - - ## Let's swim! To get started with swimming, first lay down in a water and try not to drown: diff --git a/test/data/doc/checkboxes.gt.md b/test/data/doc/checkboxes.gt.md index f9114275..e4303929 100644 --- a/test/data/doc/checkboxes.gt.md +++ b/test/data/doc/checkboxes.gt.md @@ -1,7 +1,3 @@ - - - - Security Classification / Classification de sécurité Contract Number / Numéro du contrat @@ -166,8 +162,6 @@ UNCLASSIFIED - - Security Classification / Classification de sécurité Contract Number / Numéro du contrat diff --git a/test/data/doc/cross_page_lists_chunks.json b/test/data/doc/cross_page_lists_chunks.json index 92ad398d..e7abf3c3 100644 --- a/test/data/doc/cross_page_lists_chunks.json +++ b/test/data/doc/cross_page_lists_chunks.json @@ -1,7 +1,7 @@ { "root": [ { - "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. 
As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", + "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. 
Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", @@ -307,789 +307,89 @@ ] }, { - "self_ref": "#/texts/15", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 53.0, - "t": 700.675, - "r": 241.03499999999997, - "b": 676.965, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 61 - ] - } - ] - }, - { - "self_ref": "#/texts/16", - "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 665.675, - "r": 294.85, - "b": 574.47, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 264 - ] - } - ] - }, - { - "self_ref": "#/texts/17", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 53.0, - "t": 563.1800000000001, - "r": 252.34000000000003, - "b": 552.97, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 40 - ] - } - ] - }, - { - "self_ref": "#/texts/18", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 53.0, - "t": 545.685, - 
"r": 295.29999999999995, - "b": 508.47, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 131 - ] - } - ] - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 497.185, - "r": 254.35000000000002, - "b": 473.475, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 53 - ] - } - ] - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 466.185, - "r": 173.92499999999995, - "b": 455.975, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 21 - ] - } - ] - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 448.69, - "r": 198.11, - "b": 438.475, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 25 - ] - } - ] - }, - { - "self_ref": "#/texts/22", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 431.19, - "r": 136.28499999999997, - "b": 420.975, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 12 - ] - } - ] - }, - { - "self_ref": "#/texts/23", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 413.69, - "r": 138.40499999999997, - "b": 403.48, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 13 - ] - } - ] - }, - { - "self_ref": "#/texts/24", - "parent": { - "$ref": "#/groups/3" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 71.005, - "t": 396.19, - "r": 192.79999999999995, - "b": 385.98, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 23 - ] - } - ] - }, - { - "self_ref": "#/texts/12", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 53.0, - "t": 42.09500000000003, - "r": 153.175, - "b": 34.51999999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - }, - { - "self_ref": "#/texts/13", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 153.175, - "t": 42.09000000000003, - "r": 279.82, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 36 - ] - } - ] - }, - { - "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 501.77, - "t": 42.09000000000003, - "r": 559.0, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 17 - ] - } - ] - }, - { - "self_ref": "#/texts/25", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "section_header", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 315.0, - "t": 699.765, - "r": 488.70000000000005, - "b": 686.88, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 26 - ] - } - ] - }, - { - "self_ref": "#/texts/26", - 
"parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 315.0, - "t": 680.55, - "r": 556.2550000000001, - "b": 535.345, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 490 - ] - } - ] - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 315.0, - "t": 524.0550000000001, - "r": 561.4000000000001, - "b": 392.35, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 440 - ] - } - ] - }, - { - "self_ref": "#/texts/28", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 53.0, - "t": 42.09500000000003, - "r": 152.53499999999997, - "b": 34.51999999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - }, - { - "self_ref": "#/texts/29", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 152.53499999999997, - "t": 42.09000000000003, - "r": 279.17499999999995, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 36 - ] - } - ] - }, - { - "self_ref": "#/texts/30", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 2, - "bbox": { - "l": 501.77, - "t": 42.09000000000003, - "r": 559.0, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 17 - ] - } - ] - } - ] - } - }, - { - "text": "## DIVERSITY, EQUITY AND INCLUSION\n\nWe are committed to accelerate our efforts around Diversity, Equity, and Inclusion (DE&I) within Neurocrine and in the life sciences community. Our Compensation Committee provides Board oversight of our DE&I program, and our Chief Human Resources Officer has managerial responsibility for our diversity initiatives.\n\nTo help provide advice and guidance on DE&I priorities and initiatives, we have in place a DE&I Council of 11 full-time employees, including our Chief Corporate Affairs Officer as Chair and Executive Sponsor. The DE&I Council oversees priorities and initiatives that support Neurocrine's DE&I strategic framework and goals. Representing different backgrounds and roles from across the company, the DE&I Council meets monthly to discuss what is being actioned on DE&I, examine how it's working, and provide input on what else we should prioritize. As a Biocom California member organization, we are a signatory to their DE&I Member Pledge. Our action supports our commitments under this pledge.\n\nOur multi-faceted DE&I program includes the following initiatives:\n\n- \u2022 Mentorships and internship programs featuring diverse employees and students\n- \u2022 Wylie Vale Neurocrine Biosciences SD2 Scholarship , which focuses on supporting the growth and development of underrepresented collegiate students pursuing a STEM- related degree\n- \u2022 Career watch for high-potential diverse talent\n- \u2022 Build Science, Technology, Engineering and Mathematics (STEM) employee candidate pipeline via involvement with:\n - \u00bb Historically Black Colleges and Universities (HBCUs) site visits and career fairs\n - \u00bb The National Sales Network (NSN), the premier conference for Black sales professionals. 
Neurocrine has been a gold sponsor of the event and represented at the NSN career fair.\n - \u00bb The Ocean Discovery Institute (nonprofit organization using science to empower young people from underserved urban communities to transform their lives, their community, and our world as scientific and conservation leaders)\n - \u00bb San Diego Squared (STEM-focused nonprofit organization connecting underrepresented student to the power of STEM by providing access to education, mentorship and resources to develop STEM careers)\n- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n16 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. 
We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/0", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "section_header", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 53.0, - "t": 688.815, - "r": 319.925, - "b": 673.665, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 31 - ] - } - ] - }, - { - "self_ref": "#/texts/1", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 53.0, - "t": 632.15, - "r": 291.99, - "b": 540.94, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 321 - ] - } - ] - }, - { - "self_ref": "#/texts/2", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 53.0, - "t": 529.655, - "r": 289.795, - "b": 330.45, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 707 - ] - } - ] - }, - { - "self_ref": "#/texts/3", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 315.0, - "t": 632.16, - "r": 535.95, - "b": 608.38, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 67 - ] - } - ] - }, - { - "self_ref": "#/texts/4", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 315.0, - "t": 597.15, - "r": 551.98, - "b": 573.44, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 80 - ] - } - ] - }, - { - "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 315.0, - "t": 566.155, - "r": 541.83, - "b": 501.945, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 184 - ] - } - ] - }, - { - "self_ref": "#/texts/6", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 315.0, - "t": 494.655, - "r": 543.115, - "b": 484.445, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 49 - ] - } - ] - }, - { - "self_ref": "#/texts/7", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 315.0, - "t": 477.155, - "r": 531.59, - "b": 439.945, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 116 - ] - } - ] - }, - { - "self_ref": "#/texts/8", - "parent": { - "$ref": "#/groups/1" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 333.005, - "t": 428.66, - "r": 545.025, - "b": 404.95, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 84 - ] - } - ] - }, - { - "self_ref": "#/texts/9", + "self_ref": "#/texts/12", "parent": { - "$ref": "#/groups/1" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + 
"label": "page_footer", "prov": [ { "page_no": 1, "bbox": { - "l": 333.005, - "t": 397.66, - "r": 561.63, - "b": 346.95, + "l": 53.0, + "t": 42.09500000000003, + "r": 153.175, + "b": 34.51999999999998, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 180 + 28 ] } ] }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/13", "parent": { - "$ref": "#/groups/1" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "page_footer", "prov": [ { "page_no": 1, "bbox": { - "l": 333.005, - "t": 339.665, - "r": 550.675, - "b": 261.95500000000004, + "l": 153.175, + "t": 42.09000000000003, + "r": 279.82, + "b": 34.565000000000055, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 231 + 36 ] } ] }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/14", "parent": { - "$ref": "#/groups/1" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "page_footer", "prov": [ { "page_no": 1, "bbox": { - "l": 333.005, - "t": 254.66499999999996, - "r": 557.23, - "b": 190.46000000000004, + "l": 501.77, + "t": 42.09000000000003, + "r": 559.0, + "b": 34.565000000000055, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 202 + 17 ] } ] - }, + } + ] + } + }, + { + "text": "- \u2022 Build upon DE&I employee education initiatives including:\n - \u00bb Engaging all employees, including the CEO and Management Committee, in our Unconscious Bias Learning Program, Trust Workshop, and anti-harassment and anti- discrimination training. Our anti-harassment and anti-discrimination trainings are reviewed annually.\n- \u2022 Onsite mothers' room for nursing moms\n- \u2022 Celebration and promotion of widely recognized diversity and inclusion awareness months and days including but not limited to:\n - \u00bb Asian American and Pacific Islander Heritage Month\n - \u00bb Black History Month\n - \u00bb Hispanic Heritage Month\n - \u00bb Juneteenth\n - \u00bb Pride Month\n - \u00bb Women's History Month\n\n## Employee resource networks\n\nValuing the broad range of diversity at Neurocrine Biosciences, we recognize the important role that Employee Resource Networks (ERNs) play in creating an inclusive culture where all huge employees can thrive. ERNs are open to all employees to join for support and connection based on common interests, backgrounds, or demographics, promoting a more diverse, equitable, and inclusive workplace. Aimed at being educational and supportive, ERNs align with our overall DE&I strategy.\n\nERNs are supported by an Executive Sponsor and the Director of DE&I and governed by a core leadership team group of 5-6 volunteers, representing the field and corporate office. 
We currently have an Asian ERN, Black ERN, Christian ERN, disAbility ERN, Hispanic ERN, Young Professionals ERN, and a Women ERN, and we welcome the formation of ERNs for LGBTQIA+ people, veterans, people of all faiths, and other underrepresented groups.\n\n17 Neurocrine Biosciences\n\n2024 Corporate Sustainability Report\n\n< Return to ToC >", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ { "self_ref": "#/texts/15", "parent": { @@ -1340,81 +640,6 @@ } ] }, - { - "self_ref": "#/texts/12", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 53.0, - "t": 42.09500000000003, - "r": 153.175, - "b": 34.51999999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - }, - { - "self_ref": "#/texts/13", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 153.175, - "t": 42.09000000000003, - "r": 279.82, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 36 - ] - } - ] - }, - { - "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "page_footer", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 501.77, - "t": 42.09000000000003, - "r": 559.0, - "b": 34.565000000000055, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 17 - ] - } - ] - }, { "self_ref": "#/texts/25", "parent": { From cd496c502618446a86bd0c8a9d1c551c5ca32385 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 20:43:37 +0100 Subject: [PATCH 12/22] eliminate serialization dupliation between meta & (legacy) annotations Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 29 +- .../transforms/serializer/markdown.py | 4 +- test/data/chunker/0_out_chunks.json | 5507 ++++------------- test/data/chunker/0b_out_chunks.json | 5507 ++++------------- test/data/doc/2408.09869v3_enriched.gt.md | 2 - ...3_enriched_p1_mark_annotations_false.gt.md | 2 - ...8.09869v3_enriched_p1_mark_meta_true.gt.md | 87 + ...notations_true_mark_annotations_true.gt.md | 49 + test/data/doc/barchart.gt.md | 2 - test/test_serialization.py | 26 +- 10 files changed, 2661 insertions(+), 8554 deletions(-) create mode 100644 test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md create mode 100644 test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 288c46c3..86fc120c 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -12,7 +12,14 @@ from pathlib import Path from typing import Any, Iterable, Optional, Tuple, Union -from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_field +from pydantic import ( + AnyUrl, + BaseModel, + ConfigDict, + Field, + NonNegativeInt, + computed_field, +) from typing_extensions import Self, override from docling_core.transforms.serializer.base import ( @@ -198,6 +205,9 @@ class CommonParams(BaseModel): include_formatting: bool = True include_hyperlinks: bool = True caption_delim: str = " " + use_legacy_annotations: bool = Field( + default=False, description="Use legacy annotation serialization." 
+ ) # allowed_meta_names: Optional[set[str]] = Field( # default=None, @@ -444,13 +454,16 @@ def get_parts( else: my_visited.add(node.self_ref) - part = self.serialize_meta( - item=node, - level=lvl, - **kwargs, - ) - if part.text: - parts.append(part) + if not params.use_legacy_annotations and ( + not item or item.self_ref not in self.get_excluded_refs(**kwargs) + ): + part = self.serialize_meta( + item=node, + level=lvl, + **kwargs, + ) + if part.text: + parts.append(part) if params.include_non_meta: part = self.serialize( diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 52fe3209..45e7b718 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -390,7 +390,7 @@ def serialize( if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - if params.include_annotations: + if params.use_legacy_annotations and params.include_annotations: ann_res = doc_serializer.serialize_annotations( item=item, @@ -459,7 +459,7 @@ def serialize( res_parts.append(cap_res) if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): - if params.include_annotations: + if params.use_legacy_annotations and params.include_annotations: ann_res = doc_serializer.serialize_annotations( item=item, **kwargs, diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index d3630d3a..a32d9912 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -1,45 +1,5 @@ { "root": [ - { - "text": "In this image we can see a cartoon image of a duck holding a paper.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/0", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 261.966552734375, - "t": 715.8966522216797, - "r": 348.65899658203125, - "b": 627.1333770751953, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, { "text": "Version 1.0", "meta": { @@ -853,7 +813,7 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\nIn this image, we can see some text and images.", + "text": "Figure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", @@ -882,81 +842,6 @@ ] } ] - }, - { - "self_ref": "#/pictures/1", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/31" - }, - { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" - }, - { - "$ref": "#/texts/34" - }, - { - "$ref": "#/texts/35" - }, - { - "$ref": "#/texts/36" - }, - { - "$ref": "#/texts/37" - }, - { - "$ref": "#/texts/38" - }, - { - "$ref": "#/texts/39" - }, - { - "$ref": "#/texts/40" - }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, - { - "$ref": "#/texts/44" - }, - { - "$ref": "#/texts/45" - }, - { - "$ref": "#/texts/46" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 3, - "bbox": { - "l": 110.07231140136719, - "t": 719.2913360595703, - "r": 500.7577209472656, - "b": 581.2926177978516, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] } ], "headings": [ @@ -3261,3964 +3146,1293 @@ } }, { - "text": "In this image there is a table with some text on it.", + "text": "AGL Energy Limited ABN 74 1", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ { - "self_ref": "#/pictures/2", + "self_ref": "#/texts/393", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/129" - }, - { - "$ref": "#/texts/130" - }, - { - "$ref": "#/texts/131" - }, - { - "$ref": "#/texts/132" - }, - { - "$ref": "#/texts/133" - }, - { - "$ref": "#/texts/134" - }, - { - "$ref": "#/texts/135" - }, - { - "$ref": "#/texts/136" - }, - { - "$ref": "#/texts/137" - }, - { - "$ref": "#/texts/138" - }, - { - "$ref": "#/texts/139" - }, - { - "$ref": "#/texts/140" - }, - { - "$ref": "#/texts/141" - }, - { - "$ref": "#/texts/142" - }, - { - "$ref": "#/texts/143" - }, - { - "$ref": "#/texts/144" - }, - { - "$ref": "#/texts/145" - }, - { - "$ref": "#/texts/146" - }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/147" - }, + "page_no": 7, + "bbox": { + "l": 226.786, + "t": 560.516, + "r": 233.176, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "5 061 375", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/394", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/148" - }, + "page_no": 7, + "bbox": { + "l": 233.40500000000003, + "t": 560.516, + "r": 235.66499999999996, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 1: Four examples of complex page layouts across different document categories", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/503", + "parent": { + "$ref": "#/body" + }, + 
"children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/149" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 499.2799999999999, + "r": 312.251, + "b": 490.75200000000007, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 84 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/505", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/150" - }, - { - "$ref": "#/texts/151" - }, - { - "$ref": "#/texts/152" - }, - { - "$ref": "#/texts/153" - }, - { - "$ref": "#/texts/154" - }, - { - "$ref": "#/texts/155" - }, - { - "$ref": "#/texts/156" - }, - { - "$ref": "#/texts/157" - }, - { - "$ref": "#/texts/158" - }, - { - "$ref": "#/texts/159" - }, - { - "$ref": "#/texts/160" - }, - { - "$ref": "#/texts/161" - }, - { - "$ref": "#/texts/162" - }, - { - "$ref": "#/texts/163" - }, - { - "$ref": "#/texts/164" - }, - { - "$ref": "#/texts/165" - }, - { - "$ref": "#/texts/166" - }, - { - "$ref": "#/texts/167" - }, - { - "$ref": "#/texts/168" - }, - { - "$ref": "#/texts/169" - }, - { - "$ref": "#/texts/170" - }, - { - "$ref": "#/texts/171" - }, - { - "$ref": "#/texts/172" - }, - { - "$ref": "#/texts/173" - }, - { - "$ref": "#/texts/174" - }, - { - "$ref": "#/texts/175" - }, - { - "$ref": "#/texts/176" - }, - { - "$ref": "#/texts/177" - }, - { - "$ref": "#/texts/178" - }, - { - "$ref": "#/texts/179" - }, - { - "$ref": "#/texts/180" - }, - { - "$ref": "#/texts/181" - }, - { - "$ref": "#/texts/182" - }, - { - "$ref": "#/texts/183" - }, - { - "$ref": "#/texts/184" - }, - { - "$ref": "#/texts/185" - }, - { - "$ref": "#/texts/186" - }, - { - "$ref": "#/texts/187" - }, - { - "$ref": "#/texts/188" - }, - { - "$ref": "#/texts/189" - }, - { - "$ref": "#/texts/190" - }, - { - "$ref": "#/texts/191" - }, - { - "$ref": "#/texts/192" - }, - { - "$ref": "#/texts/193" - }, - { - "$ref": "#/texts/194" - }, - { - "$ref": "#/texts/195" - }, - { - "$ref": "#/texts/196" - }, - { - "$ref": "#/texts/197" - }, - { - "$ref": "#/texts/198" - }, - { - "$ref": "#/texts/199" - }, - { - "$ref": "#/texts/200" - }, - { - "$ref": "#/texts/201" - }, - { - "$ref": "#/texts/202" - }, - { - "$ref": "#/texts/203" - }, - { - "$ref": "#/texts/204" - }, - { - "$ref": "#/texts/205" - }, - { - "$ref": "#/texts/206" - }, - { - "$ref": "#/texts/207" - }, - { - "$ref": "#/texts/208" - }, - { - "$ref": "#/texts/209" - }, - { - "$ref": "#/texts/210" - }, - { - "$ref": "#/texts/211" - }, - { - "$ref": "#/texts/212" - }, - { - "$ref": "#/texts/213" - }, - { - "$ref": "#/texts/214" - }, - { - "$ref": "#/texts/215" - }, - { - "$ref": "#/texts/216" - }, - { - "$ref": "#/texts/217" - }, - { - "$ref": "#/texts/218" - }, - { - "$ref": "#/texts/219" - }, - { - "$ref": "#/texts/220" - }, - { - "$ref": "#/texts/221" - }, - { - "$ref": "#/texts/222" - }, - { - "$ref": "#/texts/223" - }, - { - "$ref": "#/texts/224" - }, - { - "$ref": "#/texts/225" - }, - { - "$ref": "#/texts/226" - }, - { - "$ref": "#/texts/227" - }, - { - "$ref": "#/texts/228" - }, - { - "$ref": "#/texts/229" - }, - { - "$ref": "#/texts/230" - }, - { - 
"$ref": "#/texts/231" - }, - { - "$ref": "#/texts/232" - }, - { - "$ref": "#/texts/233" - }, - { - "$ref": "#/texts/234" - }, - { - "$ref": "#/texts/235" - }, - { - "$ref": "#/texts/236" - }, - { - "$ref": "#/texts/237" - }, - { - "$ref": "#/texts/238" - }, - { - "$ref": "#/texts/239" - }, - { - "$ref": "#/texts/240" - }, - { - "$ref": "#/texts/241" - }, - { - "$ref": "#/texts/242" - }, - { - "$ref": "#/texts/243" - }, - { - "$ref": "#/texts/244" - }, - { - "$ref": "#/texts/245" - }, - { - "$ref": "#/texts/246" - }, - { - "$ref": "#/texts/247" - }, - { - "$ref": "#/texts/248" - }, - { - "$ref": "#/texts/249" - }, - { - "$ref": "#/texts/250" - }, - { - "$ref": "#/texts/251" - }, - { - "$ref": "#/texts/252" - }, - { - "$ref": "#/texts/253" - }, - { - "$ref": "#/texts/254" - }, - { - "$ref": "#/texts/255" - }, - { - "$ref": "#/texts/256" - }, - { - "$ref": "#/texts/257" - }, - { - "$ref": "#/texts/258" - }, - { - "$ref": "#/texts/259" - }, - { - "$ref": "#/texts/260" - }, - { - "$ref": "#/texts/261" - }, - { - "$ref": "#/texts/262" - }, - { - "$ref": "#/texts/263" - }, - { - "$ref": "#/texts/264" - }, - { - "$ref": "#/texts/265" - }, - { - "$ref": "#/texts/266" - }, - { - "$ref": "#/texts/267" - }, - { - "$ref": "#/texts/268" - }, - { - "$ref": "#/texts/269" - }, - { - "$ref": "#/texts/270" - }, - { - "$ref": "#/texts/271" - }, - { - "$ref": "#/texts/272" - }, - { - "$ref": "#/texts/273" - }, - { - "$ref": "#/texts/274" - }, - { - "$ref": "#/texts/275" - }, - { - "$ref": "#/texts/276" - }, - { - "$ref": "#/texts/277" - }, - { - "$ref": "#/texts/278" - }, - { - "$ref": "#/texts/279" - }, - { - "$ref": "#/texts/280" - }, - { - "$ref": "#/texts/281" - }, - { - "$ref": "#/texts/282" - }, - { - "$ref": "#/texts/283" - }, - { - "$ref": "#/texts/284" - }, - { - "$ref": "#/texts/285" - }, - { - "$ref": "#/texts/286" - }, - { - "$ref": "#/texts/287" - }, - { - "$ref": "#/texts/288" - }, - { - "$ref": "#/texts/289" - }, - { - "$ref": "#/texts/290" - }, - { - "$ref": "#/texts/291" - }, - { - "$ref": "#/texts/292" - }, - { - "$ref": "#/texts/293" - }, - { - "$ref": "#/texts/294" - }, - { - "$ref": "#/texts/295" - }, - { - "$ref": "#/texts/296" - }, - { - "$ref": "#/texts/297" - }, - { - "$ref": "#/texts/298" - }, - { - "$ref": "#/texts/299" - }, - { - "$ref": "#/texts/300" - }, - { - "$ref": "#/texts/301" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 223.45245361328125, - "t": 606.3411560058594, - "r": 277.1462707519531, - "b": 563.2440032958984, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image we can see a text.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/3", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/302" - }, - { - "$ref": "#/texts/303" - }, - { - "$ref": "#/texts/304" - }, - { - "$ref": "#/texts/305" - }, - { - "$ref": "#/texts/306" - }, - { - "$ref": "#/texts/307" - }, - { - "$ref": "#/texts/308" - }, - { - "$ref": "#/texts/309" - }, - { - "$ref": "#/texts/310" - }, - { - "$ref": "#/texts/311" - }, - { - "$ref": "#/texts/312" - }, - { - "$ref": "#/texts/313" - }, - { - "$ref": "#/texts/314" - }, - { - "$ref": "#/texts/315" - }, 
- { - "$ref": "#/texts/316" - }, - { - "$ref": "#/texts/317" - }, - { - "$ref": "#/texts/318" - }, - { - "$ref": "#/texts/319" - }, - { - "$ref": "#/texts/320" - }, - { - "$ref": "#/texts/321" - }, - { - "$ref": "#/texts/322" - }, - { - "$ref": "#/texts/323" - }, - { - "$ref": "#/texts/324" - }, - { - "$ref": "#/texts/325" - }, - { - "$ref": "#/texts/326" - }, - { - "$ref": "#/texts/327" - }, - { - "$ref": "#/texts/328" - }, - { - "$ref": "#/texts/329" - }, - { - "$ref": "#/texts/330" - }, - { - "$ref": "#/texts/331" - }, - { - "$ref": "#/texts/332" - }, - { - "$ref": "#/texts/333" - }, - { - "$ref": "#/texts/334" - }, - { - "$ref": "#/texts/335" - }, - { - "$ref": "#/texts/336" - }, - { - "$ref": "#/texts/337" - }, - { - "$ref": "#/texts/338" - }, - { - "$ref": "#/texts/339" - }, - { - "$ref": "#/texts/340" - }, - { - "$ref": "#/texts/341" - }, - { - "$ref": "#/texts/342" - }, - { - "$ref": "#/texts/343" - }, - { - "$ref": "#/texts/344" - }, - { - "$ref": "#/texts/345" - }, - { - "$ref": "#/texts/346" - }, - { - "$ref": "#/texts/347" - }, - { - "$ref": "#/texts/348" - }, - { - "$ref": "#/texts/349" - }, - { - "$ref": "#/texts/350" - }, - { - "$ref": "#/texts/351" - }, - { - "$ref": "#/texts/352" - }, - { - "$ref": "#/texts/353" - }, - { - "$ref": "#/texts/354" - }, - { - "$ref": "#/texts/355" - }, - { - "$ref": "#/texts/356" - }, - { - "$ref": "#/texts/357" - }, - { - "$ref": "#/texts/358" - }, - { - "$ref": "#/texts/359" - }, - { - "$ref": "#/texts/360" - }, - { - "$ref": "#/texts/361" - }, - { - "$ref": "#/texts/362" - }, - { - "$ref": "#/texts/363" - }, - { - "$ref": "#/texts/364" - }, - { - "$ref": "#/texts/365" - }, - { - "$ref": "#/texts/366" - }, - { - "$ref": "#/texts/367" - }, - { - "$ref": "#/texts/368" - }, - { - "$ref": "#/texts/369" - }, - { - "$ref": "#/texts/370" - }, - { - "$ref": "#/texts/371" - }, - { - "$ref": "#/texts/372" - }, - { - "$ref": "#/texts/373" - }, - { - "$ref": "#/texts/374" - }, - { - "$ref": "#/texts/375" - }, - { - "$ref": "#/texts/376" - }, - { - "$ref": "#/texts/377" - }, - { - "$ref": "#/texts/378" - }, - { - "$ref": "#/texts/379" - }, - { - "$ref": "#/texts/380" - }, - { - "$ref": "#/texts/381" - }, - { - "$ref": "#/texts/382" - }, - { - "$ref": "#/texts/383" - }, - { - "$ref": "#/texts/384" - }, - { - "$ref": "#/texts/385" - }, - { - "$ref": "#/texts/386" - }, - { - "$ref": "#/texts/387" - }, - { - "$ref": "#/texts/388" - }, - { - "$ref": "#/texts/389" - }, - { - "$ref": "#/texts/390" - }, - { - "$ref": "#/texts/391" - }, - { - "$ref": "#/texts/392" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 279.03204345703125, - "t": 607.0251770019531, - "r": 312.2338562011719, - "b": 562.7499389648438, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "AGL Energy Limited ABN 74 1", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/393", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 226.786, - "t": 560.516, - "r": 233.176, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - } - ], - "headings": [ - 
"Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "5 061 375", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/394", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 233.40500000000003, - "t": 560.516, - "r": 235.66499999999996, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 9 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image I can see the text on the image.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/4", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/395" - }, - { - "$ref": "#/texts/396" - }, - { - "$ref": "#/texts/397" - }, - { - "$ref": "#/texts/398" - }, - { - "$ref": "#/texts/399" - }, - { - "$ref": "#/texts/400" - }, - { - "$ref": "#/texts/401" - }, - { - "$ref": "#/texts/402" - }, - { - "$ref": "#/texts/403" - }, - { - "$ref": "#/texts/404" - }, - { - "$ref": "#/texts/405" - }, - { - "$ref": "#/texts/406" - }, - { - "$ref": "#/texts/407" - }, - { - "$ref": "#/texts/408" - }, - { - "$ref": "#/texts/409" - }, - { - "$ref": "#/texts/410" - }, - { - "$ref": "#/texts/411" - }, - { - "$ref": "#/texts/412" - }, - { - "$ref": "#/texts/413" - }, - { - "$ref": "#/texts/414" - }, - { - "$ref": "#/texts/415" - }, - { - "$ref": "#/texts/416" - }, - { - "$ref": "#/texts/417" - }, - { - "$ref": "#/texts/418" - }, - { - "$ref": "#/texts/419" - }, - { - "$ref": "#/texts/420" - }, - { - "$ref": "#/texts/421" - }, - { - "$ref": "#/texts/422" - }, - { - "$ref": "#/texts/423" - }, - { - "$ref": "#/texts/424" - }, - { - "$ref": "#/texts/425" - }, - { - "$ref": "#/texts/426" - }, - { - "$ref": "#/texts/427" - }, - { - "$ref": "#/texts/428" - }, - { - "$ref": "#/texts/429" - }, - { - "$ref": "#/texts/430" - }, - { - "$ref": "#/texts/431" - }, - { - "$ref": "#/texts/432" - }, - { - "$ref": "#/texts/433" - }, - { - "$ref": "#/texts/434" - }, - { - "$ref": "#/texts/435" - }, - { - "$ref": "#/texts/436" - }, - { - "$ref": "#/texts/437" - }, - { - "$ref": "#/texts/438" - }, - { - "$ref": "#/texts/439" - }, - { - "$ref": "#/texts/440" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 224.6795196533203, - "t": 559.731201171875, - "r": 268.13018798828125, - "b": 503.4937438964844, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image there is a paper with some text on it.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/5", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/441" - }, - { - "$ref": "#/texts/442" - }, - { - "$ref": "#/texts/443" - }, - { - "$ref": 
"#/texts/444" - }, - { - "$ref": "#/texts/445" - }, - { - "$ref": "#/texts/446" - }, - { - "$ref": "#/texts/447" - }, - { - "$ref": "#/texts/448" - }, - { - "$ref": "#/texts/449" - }, - { - "$ref": "#/texts/450" - }, - { - "$ref": "#/texts/451" - }, - { - "$ref": "#/texts/452" - }, - { - "$ref": "#/texts/453" - }, - { - "$ref": "#/texts/454" - }, - { - "$ref": "#/texts/455" - }, - { - "$ref": "#/texts/456" - }, - { - "$ref": "#/texts/457" - }, - { - "$ref": "#/texts/458" - }, - { - "$ref": "#/texts/459" - }, - { - "$ref": "#/texts/460" - }, - { - "$ref": "#/texts/461" - }, - { - "$ref": "#/texts/462" - }, - { - "$ref": "#/texts/463" - }, - { - "$ref": "#/texts/464" - }, - { - "$ref": "#/texts/465" - }, - { - "$ref": "#/texts/466" - }, - { - "$ref": "#/texts/467" - }, - { - "$ref": "#/texts/468" - }, - { - "$ref": "#/texts/469" - }, - { - "$ref": "#/texts/470" - }, - { - "$ref": "#/texts/471" - }, - { - "$ref": "#/texts/472" - }, - { - "$ref": "#/texts/473" - }, - { - "$ref": "#/texts/474" - }, - { - "$ref": "#/texts/475" - }, - { - "$ref": "#/texts/476" - }, - { - "$ref": "#/texts/477" - }, - { - "$ref": "#/texts/478" - }, - { - "$ref": "#/texts/479" - }, - { - "$ref": "#/texts/480" - }, - { - "$ref": "#/texts/481" - }, - { - "$ref": "#/texts/482" - }, - { - "$ref": "#/texts/483" - }, - { - "$ref": "#/texts/484" - }, - { - "$ref": "#/texts/485" - }, - { - "$ref": "#/texts/486" - }, - { - "$ref": "#/texts/487" - }, - { - "$ref": "#/texts/488" - }, - { - "$ref": "#/texts/489" - }, - { - "$ref": "#/texts/490" - }, - { - "$ref": "#/texts/491" - }, - { - "$ref": "#/texts/492" - }, - { - "$ref": "#/texts/493" - }, - { - "$ref": "#/texts/494" - }, - { - "$ref": "#/texts/495" - }, - { - "$ref": "#/texts/496" - }, - { - "$ref": "#/texts/497" - }, - { - "$ref": "#/texts/498" - }, - { - "$ref": "#/texts/499" - }, - { - "$ref": "#/texts/500" - }, - { - "$ref": "#/texts/501" - }, - { - "$ref": "#/texts/502" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 269.2328186035156, - "t": 558.8644409179688, - "r": 311.74884033203125, - "b": 502.994873046875, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 1: Four examples of complex page layouts across different document categories", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/503", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 499.2799999999999, - "r": 312.251, - "b": 490.75200000000007, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 84 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/505", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": 
"text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 474.62299999999993, - "r": 312.021, - "b": 465.961, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 90 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/507", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 458.719, - "r": 312.156, - "b": 436.156, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 374 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "1 INTRODUCTION", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/508", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 329.602, - "t": 428.537, - "r": 373.375, - "b": 423.963, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 14 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/509", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 108.0, - "t": 419.051, - "r": 527.591, - "b": 377.77099999999996, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1026 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/511", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.99899999999998, - "t": 563.105, - "r": 338.603, - "b": 558.655, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 130 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/512", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.87200000000001, - "t": 552.103, - "r": 226.37599999999998, - "b": 509.485, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 489 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, human = 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, MRCNN R50 R101 = 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, FRCNN R101 = 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4. 
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, YOLO v5x6 = 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/1", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 125.8864517211914, - "t": 505.50439453125, - "r": 223.0050506591797, - "b": 437.8017272949219, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/513", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.884, - "t": 431.161, - "r": 226.336, - "b": 341.5470000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1252 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/515", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.86499999999998, - "t": 327.581, - "r": 226.282, - "b": 284.81, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 584 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image, we can see a table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/6", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 366.8663635253906, - "t": 542.9663391113281, - "r": 460.8086242675781, - "b": 450.9350280761719, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Third, achienec", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/516", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 436.0, - "t": 447.0, - "r": 509.66666666666663, - "b": 418.66666666666663, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 15 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "chalenongayouls ground-vuth dawa such WC", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/518", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 366.0, - "t": 386.0, - "r": 529.3333333333334, - "b": 375.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 40 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/7", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 237.6404266357422, - "t": 550.1458740234375, - "r": 337.0112609863281, - "b": 477.0093078613281, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/519", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 469.97300000000007, - "r": 339.288, - "b": 441.408, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 322 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/520", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 425.568, - "r": 338.603, - "b": 415.587, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 102 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). 
These scores are computed by leveraging the evaluation code provided by the COCO API [16].", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/521", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.776, - "t": 416.19999999999993, - "r": 338.703, - "b": 382.7970000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 397 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . 
This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/523", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.823, - "t": 370.85, - "r": 338.7, - "b": 285.921, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1146 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "coioct dcochon modols", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/524", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 456.6666666666667, - "t": 344.0, - "r": 485.33333333333337, - "b": 341.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 21 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "mak enbrel", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/526", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 470.6666666666667, - "t": 308.6666666666667, - "r": 524.0, - "b": 285.3333333333333, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 10 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/527", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 108.0, - "t": 266.424, - "r": 504.00300000000004, - "b": 225.14499999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 393 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/529", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 598.985, - "r": 186.95, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 48 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/530", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 190.471, - "t": 598.985, - "r": 346.254, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 81 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/531", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.525, - "t": 586.821, - "r": 346.401, - "b": 580.676, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 123 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/532", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 575.628, - "r": 301.135, - "b": 569.484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 99 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/533", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 581.225, - "r": 346.254, - "b": 575.08, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 124 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. 
**Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/8", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/534" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.43017578125, - "t": 573.9806060791016, - "r": 124.71578216552734, - "b": 559.4710540771484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/9", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/535" - }, - { - "$ref": "#/texts/536" - }, - { - "$ref": "#/texts/537" - }, - { - "$ref": "#/texts/538" - }, - { - "$ref": "#/texts/539" - }, - { - "$ref": "#/texts/540" - }, - { - "$ref": "#/texts/541" - }, - { - "$ref": "#/texts/542" - }, - { - "$ref": "#/texts/543" - }, - { - "$ref": "#/texts/544" - }, - { - "$ref": "#/texts/545" - }, - { - "$ref": "#/texts/546" - }, - { - "$ref": "#/texts/547" - }, - { - "$ref": "#/texts/548" - }, - { - "$ref": "#/texts/549" - }, - { - "$ref": "#/texts/550" - }, - { - "$ref": "#/texts/551" - }, - { - "$ref": "#/texts/552" - }, - { - "$ref": "#/texts/553" - }, - { - "$ref": "#/texts/554" - }, - { - "$ref": "#/texts/555" - }, - { - "$ref": "#/texts/556" - }, - { - "$ref": "#/texts/557" - }, - { - "$ref": "#/texts/558" - }, - { - "$ref": "#/texts/559" - }, - { - "$ref": "#/texts/560" - }, - { - "$ref": "#/texts/561" - }, - { - "$ref": "#/texts/562" - }, - { - "$ref": "#/texts/563" - }, - { - "$ref": "#/texts/564" - }, - { - "$ref": "#/texts/565" - }, - { - "$ref": "#/texts/566" - }, - { - "$ref": "#/texts/567" - }, - { - "$ref": "#/texts/568" - }, - { - "$ref": "#/texts/569" - }, - { - "$ref": "#/texts/570" - }, - { - "$ref": "#/texts/571" - }, - { - "$ref": "#/texts/572" - }, - { - "$ref": "#/texts/573" - }, - { - "$ref": "#/texts/574" - }, - { - "$ref": "#/texts/575" - }, - { - "$ref": "#/texts/576" - }, - { - "$ref": "#/texts/577" - }, - { - "$ref": "#/texts/578" - }, - { - "$ref": "#/texts/579" - }, - { - "$ref": "#/texts/580" - }, - { - "$ref": "#/texts/581" - }, - { - "$ref": "#/texts/582" - }, - { - "$ref": "#/texts/583" - }, - { - "$ref": "#/texts/584" - }, - { - "$ref": "#/texts/585" - }, - { - "$ref": "#/texts/586" - }, - { - "$ref": "#/texts/587" - }, - { - "$ref": "#/texts/588" - }, - { - "$ref": "#/texts/589" - }, - { - "$ref": "#/texts/590" - }, - { - "$ref": "#/texts/591" - }, - { - "$ref": "#/texts/592" - }, - { - "$ref": "#/texts/593" - }, - { - "$ref": "#/texts/594" - }, - { - "$ref": "#/texts/595" - }, - { - "$ref": "#/texts/596" - }, - { - "$ref": "#/texts/597" - }, - { - "$ref": "#/texts/598" - }, - { - "$ref": "#/texts/599" - }, - { 
- "$ref": "#/texts/600" - }, - { - "$ref": "#/texts/601" - }, - { - "$ref": "#/texts/602" - }, - { - "$ref": "#/texts/603" - }, - { - "$ref": "#/texts/604" - }, - { - "$ref": "#/texts/605" - }, - { - "$ref": "#/texts/606" - }, - { - "$ref": "#/texts/607" - }, - { - "$ref": "#/texts/608" - }, - { - "$ref": "#/texts/609" - }, - { - "$ref": "#/texts/610" - }, - { - "$ref": "#/texts/611" - }, - { - "$ref": "#/texts/612" - }, - { - "$ref": "#/texts/613" - }, - { - "$ref": "#/texts/614" - }, - { - "$ref": "#/texts/615" - }, - { - "$ref": "#/texts/616" - }, - { - "$ref": "#/texts/617" - }, - { - "$ref": "#/texts/618" - }, - { - "$ref": "#/texts/619" - }, - { - "$ref": "#/texts/620" - }, - { - "$ref": "#/texts/621" - }, - { - "$ref": "#/texts/622" - }, - { - "$ref": "#/texts/623" - }, - { - "$ref": "#/texts/624" - }, - { - "$ref": "#/texts/625" - }, - { - "$ref": "#/texts/626" - }, - { - "$ref": "#/texts/627" - }, - { - "$ref": "#/texts/628" - }, - { - "$ref": "#/texts/629" - }, - { - "$ref": "#/texts/630" - }, - { - "$ref": "#/texts/631" - }, - { - "$ref": "#/texts/632" - }, - { - "$ref": "#/texts/633" - }, - { - "$ref": "#/texts/634" - }, - { - "$ref": "#/texts/635" - }, - { - "$ref": "#/texts/636" - }, - { - "$ref": "#/texts/637" - }, - { - "$ref": "#/texts/638" - }, - { - "$ref": "#/texts/639" - }, - { - "$ref": "#/texts/640" - }, - { - "$ref": "#/texts/641" - }, - { - "$ref": "#/texts/642" - }, - { - "$ref": "#/texts/643" - }, - { - "$ref": "#/texts/644" - }, - { - "$ref": "#/texts/645" - }, - { - "$ref": "#/texts/646" - }, - { - "$ref": "#/texts/647" - }, - { - "$ref": "#/texts/648" - }, - { - "$ref": "#/texts/649" - }, - { - "$ref": "#/texts/650" - }, - { - "$ref": "#/texts/651" - }, - { - "$ref": "#/texts/652" - }, - { - "$ref": "#/texts/653" - }, - { - "$ref": "#/texts/654" - }, - { - "$ref": "#/texts/655" - }, - { - "$ref": "#/texts/656" - }, - { - "$ref": "#/texts/657" - }, - { - "$ref": "#/texts/658" - }, - { - "$ref": "#/texts/659" - }, - { - "$ref": "#/texts/660" - }, - { - "$ref": "#/texts/661" - }, - { - "$ref": "#/texts/662" - }, - { - "$ref": "#/texts/663" - }, - { - "$ref": "#/texts/664" - }, - { - "$ref": "#/texts/665" - }, - { - "$ref": "#/texts/666" - }, - { - "$ref": "#/texts/667" - }, - { - "$ref": "#/texts/668" - }, - { - "$ref": "#/texts/669" - }, - { - "$ref": "#/texts/670" - }, - { - "$ref": "#/texts/671" - }, - { - "$ref": "#/texts/672" - }, - { - "$ref": "#/texts/673" - }, - { - "$ref": "#/texts/674" - }, - { - "$ref": "#/texts/675" - }, - { - "$ref": "#/texts/676" - }, - { - "$ref": "#/texts/677" - }, - { - "$ref": "#/texts/678" - }, - { - "$ref": "#/texts/679" - }, - { - "$ref": "#/texts/680" - }, - { - "$ref": "#/texts/681" - }, - { - "$ref": "#/texts/682" - }, - { - "$ref": "#/texts/683" - }, - { - "$ref": "#/texts/684" - }, - { - "$ref": "#/texts/685" - }, - { - "$ref": "#/texts/686" - }, - { - "$ref": "#/texts/687" - }, - { - "$ref": "#/texts/688" - }, - { - "$ref": "#/texts/689" - }, - { - "$ref": "#/texts/690" - }, - { - "$ref": "#/texts/691" - }, - { - "$ref": "#/texts/692" - }, - { - "$ref": "#/texts/693" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 
14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Caption, Count.Count = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. Footnote, Count.Count = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 82-97. Formula, Count.Count = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. List-item, Count.Count = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 93-95. Page-footer, Count.Count = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 96-98. Page-header, Count.Count = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. 
Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 81-86. Picture, Count.Count = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 59-76. Section-header, Count.Count = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 78-86. Table, Count.Count = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 70-85. Text, Count.Count = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 87-95. Title, Count.Count = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 24-56. Total, Count.Count = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 68-85", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/3", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image I can see a blue circle.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/10", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/694" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 332.130615234375, - "t": 576.3017578125, - "r": 346.93829345703125, - "b": 560.4401550292969, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "include publication repositories such as arXiv", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/695", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 223.57, - "t": 471.407, - "r": 306.847, - "b": 465.079, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 46 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. 
The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/696", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 146 - ] - }, - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 147, - 294 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/697", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 564.097, - "r": 408.543, - "b": 561.395, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 54 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "A table with different columns and rows.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/11", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/698" - }, - { - "$ref": "#/texts/699" - }, - { - "$ref": "#/texts/700" - }, - { - "$ref": "#/texts/701" - }, - { - "$ref": "#/texts/702" - }, - { - "$ref": "#/texts/703" - }, - { - "$ref": "#/texts/704" - }, - { - "$ref": "#/texts/705" - }, - { - "$ref": "#/texts/706" - }, - { - "$ref": "#/texts/707" - }, - { - "$ref": "#/texts/708" - }, - { - "$ref": "#/texts/709" - }, - { - "$ref": "#/texts/710" - }, - { - "$ref": "#/texts/711" - }, - { - "$ref": "#/texts/712" - }, - { - "$ref": "#/texts/713" - }, - { - "$ref": "#/texts/714" - }, - { - "$ref": "#/texts/715" - }, - { - "$ref": "#/texts/716" - }, - { - "$ref": "#/texts/717" - }, - { - "$ref": "#/texts/718" - }, - { - "$ref": "#/texts/719" - }, - { - "$ref": "#/texts/720" - }, - { - "$ref": "#/texts/721" - }, - { - "$ref": "#/texts/722" - }, - { - "$ref": "#/texts/723" - }, - { - "$ref": "#/texts/724" - }, - { - "$ref": "#/texts/725" - }, - { - "$ref": "#/texts/726" - }, - { - "$ref": "#/texts/727" - }, - { - "$ref": "#/texts/728" - }, - { - "$ref": "#/texts/729" - }, - { - "$ref": "#/texts/730" - }, - { - "$ref": "#/texts/731" - }, - { - "$ref": "#/texts/732" - }, - { - "$ref": "#/texts/733" - }, - { - "$ref": "#/texts/734" - }, - { - "$ref": "#/texts/735" - }, - { - "$ref": "#/texts/736" - }, - { - "$ref": "#/texts/737" - }, - { - "$ref": "#/texts/738" - }, - { - "$ref": "#/texts/739" - }, - { - "$ref": "#/texts/740" - }, - { - "$ref": "#/texts/741" - }, - { - "$ref": "#/texts/742" - }, - { - "$ref": "#/texts/743" - }, - { - "$ref": "#/texts/744" - }, - { - "$ref": "#/texts/745" - }, - { - "$ref": "#/texts/746" - }, - { - 
"$ref": "#/texts/747" - }, - { - "$ref": "#/texts/748" - }, - { - "$ref": "#/texts/749" - }, - { - "$ref": "#/texts/750" - }, - { - "$ref": "#/texts/751" - }, - { - "$ref": "#/texts/752" - }, - { - "$ref": "#/texts/753" - }, - { - "$ref": "#/texts/754" - }, - { - "$ref": "#/texts/755" - }, - { - "$ref": "#/texts/756" - }, - { - "$ref": "#/texts/757" - }, - { - "$ref": "#/texts/758" - }, - { - "$ref": "#/texts/759" - }, - { - "$ref": "#/texts/760" - }, - { - "$ref": "#/texts/761" - }, - { - "$ref": "#/texts/762" - }, - { - "$ref": "#/texts/763" - }, - { - "$ref": "#/texts/764" - }, - { - "$ref": "#/texts/765" - }, - { - "$ref": "#/texts/766" - }, - { - "$ref": "#/texts/767" - }, - { - "$ref": "#/texts/768" - }, - { - "$ref": "#/texts/769" - }, - { - "$ref": "#/texts/770" - }, - { - "$ref": "#/texts/771" - }, - { - "$ref": "#/texts/772" - }, - { - "$ref": "#/texts/773" - }, - { - "$ref": "#/texts/774" - }, - { - "$ref": "#/texts/775" - }, - { - "$ref": "#/texts/776" - }, - { - "$ref": "#/texts/777" - }, - { - "$ref": "#/texts/778" - }, - { - "$ref": "#/texts/779" - }, - { - "$ref": "#/texts/780" - }, - { - "$ref": "#/texts/781" - }, - { - "$ref": "#/texts/782" - }, - { - "$ref": "#/texts/783" - }, - { - "$ref": "#/texts/784" - }, - { - "$ref": "#/texts/785" - }, - { - "$ref": "#/texts/786" - }, - { - "$ref": "#/texts/787" - }, - { - "$ref": "#/texts/788" - }, - { - "$ref": "#/texts/789" - }, - { - "$ref": "#/texts/790" - }, - { - "$ref": "#/texts/791" - }, - { - "$ref": "#/texts/792" - }, - { - "$ref": "#/texts/793" - }, - { - "$ref": "#/texts/794" - }, - { - "$ref": "#/texts/795" - }, - { - "$ref": "#/texts/796" - }, - { - "$ref": "#/texts/797" - }, - { - "$ref": "#/texts/798" - }, - { - "$ref": "#/texts/799" - }, - { - "$ref": "#/texts/800" - }, - { - "$ref": "#/texts/801" - }, - { - "$ref": "#/texts/802" - }, - { - "$ref": "#/texts/803" - }, - { - "$ref": "#/texts/804" - }, - { - "$ref": "#/texts/805" - }, - { - "$ref": "#/texts/806" - }, - { - "$ref": "#/texts/807" - }, - { - "$ref": "#/texts/808" - }, - { - "$ref": "#/texts/809" - }, - { - "$ref": "#/texts/810" - }, - { - "$ref": "#/texts/811" - }, - { - "$ref": "#/texts/812" - }, - { - "$ref": "#/texts/813" - }, - { - "$ref": "#/texts/814" - }, - { - "$ref": "#/texts/815" - }, - { - "$ref": "#/texts/816" - }, - { - "$ref": "#/texts/817" - }, - { - "$ref": "#/texts/818" - }, - { - "$ref": "#/texts/819" - }, - { - "$ref": "#/texts/820" - }, - { - "$ref": "#/texts/821" - }, - { - "$ref": "#/texts/822" - }, - { - "$ref": "#/texts/823" - }, - { - "$ref": "#/texts/824" - }, - { - "$ref": "#/texts/825" - }, - { - "$ref": "#/texts/826" - }, - { - "$ref": "#/texts/827" - }, - { - "$ref": "#/texts/828" - }, - { - "$ref": "#/texts/829" - }, - { - "$ref": "#/texts/830" - }, - { - "$ref": "#/texts/831" - }, - { - "$ref": "#/texts/832" - }, - { - "$ref": "#/texts/833" - }, - { - "$ref": "#/texts/834" - }, - { - "$ref": "#/texts/835" - }, - { - "$ref": "#/texts/836" - }, - { - "$ref": "#/texts/837" - }, - { - "$ref": "#/texts/838" - }, - { - "$ref": "#/texts/839" - }, - { - "$ref": "#/texts/840" - }, - { - "$ref": "#/texts/841" - }, - { - "$ref": "#/texts/842" - }, - { - "$ref": "#/texts/843" - }, - { - "$ref": "#/texts/844" - }, - { - "$ref": "#/texts/845" - }, - { - "$ref": "#/texts/846" - }, - { - "$ref": "#/texts/847" - }, - { - "$ref": "#/texts/848" - }, - { - "$ref": "#/texts/849" - }, - { - "$ref": "#/texts/850" - }, - { - "$ref": "#/texts/851" - }, - { - "$ref": "#/texts/852" - }, - { - "$ref": "#/texts/853" - }, - { - "$ref": "#/texts/854" - 
}, - { - "$ref": "#/texts/855" - }, - { - "$ref": "#/texts/856" - }, - { - "$ref": "#/texts/857" - }, - { - "$ref": "#/texts/858" - }, - { - "$ref": "#/texts/859" - }, - { - "$ref": "#/texts/860" - }, - { - "$ref": "#/texts/861" - }, - { - "$ref": "#/texts/862" - }, - { - "$ref": "#/texts/863" - }, - { - "$ref": "#/texts/864" - }, - { - "$ref": "#/texts/865" - }, - { - "$ref": "#/texts/866" - }, - { - "$ref": "#/texts/867" - }, - { - "$ref": "#/texts/868" - }, - { - "$ref": "#/texts/869" - }, - { - "$ref": "#/texts/870" - }, - { - "$ref": "#/texts/871" - }, - { - "$ref": "#/texts/872" - }, - { - "$ref": "#/texts/873" - }, - { - "$ref": "#/texts/874" - }, - { - "$ref": "#/texts/875" - }, - { - "$ref": "#/texts/876" - }, - { - "$ref": "#/texts/877" - }, - { - "$ref": "#/texts/878" - }, - { - "$ref": "#/texts/879" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 474.62299999999993, + "r": 312.021, + "b": 465.961, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 90 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/507", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/880" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 458.719, + "r": 312.156, + "b": 436.156, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 374 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "1 INTRODUCTION", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/508", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/881" - }, + "page_no": 7, + "bbox": { + "l": 329.602, + "t": 428.537, + "r": 373.375, + "b": 423.963, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 14 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). 
Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/509", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/882" - }, + "page_no": 7, + "bbox": { + "l": 108.0, + "t": 419.051, + "r": 527.591, + "b": 377.77099999999996, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1026 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/511", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/883" - }, + "page_no": 8, + "bbox": { + "l": 122.99899999999998, + "t": 563.105, + "r": 338.603, + "b": 558.655, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 130 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/512", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/884" - }, + "page_no": 8, + "bbox": { + "l": 122.87200000000001, + "t": 552.103, + "r": 226.37599999999998, + "b": 509.485, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 489 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, human = 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86. 
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, MRCNN R50 R101 = 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, FRCNN R101 = 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, YOLO v5x6 = 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ { - "$ref": "#/texts/885" - }, + "page_no": 8, + "bbox": { + "l": 125.8864517211914, + "t": 505.50439453125, + "r": 223.0050506591797, + "b": 437.8017272949219, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/513", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/886" - }, + "page_no": 8, + "bbox": { + "l": 122.884, + "t": 431.161, + "r": 226.336, + "b": 341.5470000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1252 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. 
As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/515", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/887" - }, + "page_no": 8, + "bbox": { + "l": 122.86499999999998, + "t": 327.581, + "r": 226.282, + "b": 284.81, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 584 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Third, achienec", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/516", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/888" - }, + "page_no": 8, + "bbox": { + "l": 436.0, + "t": 447.0, + "r": 509.66666666666663, + "b": 418.66666666666663, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "chalenongayouls ground-vuth dawa such WC", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/518", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/889" - }, + "page_no": 8, + "bbox": { + "l": 366.0, + "t": 386.0, + "r": 529.3333333333334, + "b": 375.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 40 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. 
The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/519", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/890" - }, + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 469.97300000000007, + "r": 339.288, + "b": 441.408, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 322 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/520", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/891" - }, + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 425.568, + "r": 338.603, + "b": 415.587, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 102 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/521", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/892" - }, + "page_no": 8, + "bbox": { + "l": 235.776, + "t": 416.19999999999993, + "r": 338.703, + "b": 382.7970000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 397 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. 
It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/523", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/893" - }, + "page_no": 8, + "bbox": { + "l": 235.823, + "t": 370.85, + "r": 338.7, + "b": 285.921, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1146 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "coioct dcochon modols", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/524", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/894" - }, + "page_no": 8, + "bbox": { + "l": 456.6666666666667, + "t": 344.0, + "r": 485.33333333333337, + "b": 341.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "mak enbrel", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/526", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/895" - }, + "page_no": 8, + "bbox": { + "l": 470.6666666666667, + "t": 308.6666666666667, + "r": 524.0, + "b": 285.3333333333333, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 10 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/527", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/896" - }, + "page_no": 8, + "bbox": { + "l": 108.0, + "t": 266.424, + "r": 504.00300000000004, + "b": 225.14499999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 393 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/529", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/897" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 598.985, + "r": 186.95, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 48 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/530", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/898" - }, + "page_no": 9, + "bbox": { + "l": 190.471, + "t": 598.985, + "r": 346.254, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 81 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/531", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/899" - }, + "page_no": 9, + "bbox": { + "l": 88.525, + "t": 586.821, + "r": 346.401, + "b": 580.676, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 123 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/532", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/900" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 575.628, + "r": 301.135, + "b": 569.484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 99 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/533", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/901" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 581.225, + "r": 346.254, + "b": 575.08, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 124 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Caption, Count.Count = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. Footnote, Count.Count = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. 
Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 82-97. Formula, Count.Count = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. List-item, Count.Count = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 93-95. Page-footer, Count.Count = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 96-98. Page-header, Count.Count = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 81-86. Picture, Count.Count = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 59-76. Section-header, Count.Count = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. 
Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 78-86. Table, Count.Count = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 70-85. Text, Count.Count = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 87-95. Title, Count.Count = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 24-56. Total, Count.Count = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 68-85", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ { - "$ref": "#/texts/902" - }, + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "include publication repositories such as arXiv", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/695", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/903" - }, + "page_no": 9, + "bbox": { + "l": 223.57, + "t": 471.407, + "r": 306.847, + "b": 465.079, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 46 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/696", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/904" + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 146 + ] }, { - "$ref": "#/texts/905" + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 147, + 294 + ] } - ], + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/697", + "parent": { + "$ref": "#/body" + }, + "children": [], "content_layer": "body", - "meta": {}, - "label": "picture", + "label": "text", "prov": [ { "page_no": 9, "bbox": { - "l": 334.4932861328125, - "t": 558.5665130615234, - "r": 544.7938842773438, - "b": 414.31744384765625, + "l": 335.152, + "t": 564.097, + "r": 408.543, + "b": 561.395, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 0 + 54 ] } ] @@ -7679,249 +4893,6 @@ } } }, - { - "text": "In this image there is a table with some text on it.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - 
"self_ref": "#/pictures/12", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/915" - }, - { - "$ref": "#/texts/916" - }, - { - "$ref": "#/texts/917" - }, - { - "$ref": "#/texts/918" - }, - { - "$ref": "#/texts/919" - }, - { - "$ref": "#/texts/920" - }, - { - "$ref": "#/texts/921" - }, - { - "$ref": "#/texts/922" - }, - { - "$ref": "#/texts/923" - }, - { - "$ref": "#/texts/924" - }, - { - "$ref": "#/texts/925" - }, - { - "$ref": "#/texts/926" - }, - { - "$ref": "#/texts/927" - }, - { - "$ref": "#/texts/928" - }, - { - "$ref": "#/texts/929" - }, - { - "$ref": "#/texts/930" - }, - { - "$ref": "#/texts/931" - }, - { - "$ref": "#/texts/932" - }, - { - "$ref": "#/texts/933" - }, - { - "$ref": "#/texts/934" - }, - { - "$ref": "#/texts/935" - }, - { - "$ref": "#/texts/936" - }, - { - "$ref": "#/texts/937" - }, - { - "$ref": "#/texts/938" - }, - { - "$ref": "#/texts/939" - }, - { - "$ref": "#/texts/940" - }, - { - "$ref": "#/texts/941" - }, - { - "$ref": "#/texts/942" - }, - { - "$ref": "#/texts/943" - }, - { - "$ref": "#/texts/944" - }, - { - "$ref": "#/texts/945" - }, - { - "$ref": "#/texts/946" - }, - { - "$ref": "#/texts/947" - }, - { - "$ref": "#/texts/948" - }, - { - "$ref": "#/texts/949" - }, - { - "$ref": "#/texts/950" - }, - { - "$ref": "#/texts/951" - }, - { - "$ref": "#/texts/952" - }, - { - "$ref": "#/texts/953" - }, - { - "$ref": "#/texts/954" - }, - { - "$ref": "#/texts/955" - }, - { - "$ref": "#/texts/956" - }, - { - "$ref": "#/texts/957" - }, - { - "$ref": "#/texts/958" - }, - { - "$ref": "#/texts/959" - }, - { - "$ref": "#/texts/960" - }, - { - "$ref": "#/texts/961" - }, - { - "$ref": "#/texts/962" - }, - { - "$ref": "#/texts/963" - }, - { - "$ref": "#/texts/964" - }, - { - "$ref": "#/texts/965" - }, - { - "$ref": "#/texts/966" - }, - { - "$ref": "#/texts/967" - }, - { - "$ref": "#/texts/968" - }, - { - "$ref": "#/texts/969" - }, - { - "$ref": "#/texts/970" - }, - { - "$ref": "#/texts/971" - }, - { - "$ref": "#/texts/972" - }, - { - "$ref": "#/texts/973" - }, - { - "$ref": "#/texts/974" - }, - { - "$ref": "#/texts/975" - }, - { - "$ref": "#/texts/976" - }, - { - "$ref": "#/texts/977" - }, - { - "$ref": "#/texts/978" - }, - { - "$ref": "#/texts/979" - }, - { - "$ref": "#/texts/980" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 108.79005432128906, - "t": 467.1181335449219, - "r": 329.1195068359375, - "b": 308.97198486328125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, { "text": "we distributed the annotation workload and performed continuous be annotated. 
We refrained from class labels that are very specific", "meta": { diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 6620ef1e..87597d93 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -1,45 +1,5 @@ { "root": [ - { - "text": "In this image we can see a cartoon image of a duck holding a paper.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/0", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 1, - "bbox": { - "l": 261.966552734375, - "t": 715.8966522216797, - "r": 348.65899658203125, - "b": 627.1333770751953, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, { "text": "Version 1.0", "meta": { @@ -853,7 +813,7 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\nIn this image, we can see some text and images.", + "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", @@ -882,81 +842,6 @@ ] } ] - }, - { - "self_ref": "#/pictures/1", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/31" - }, - { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" - }, - { - "$ref": "#/texts/34" - }, - { - "$ref": "#/texts/35" - }, - { - "$ref": "#/texts/36" - }, - { - "$ref": "#/texts/37" - }, - { - "$ref": "#/texts/38" - }, - { - "$ref": "#/texts/39" - }, - { - "$ref": "#/texts/40" - }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, - { - "$ref": "#/texts/44" - }, - { - "$ref": "#/texts/45" - }, - { - "$ref": "#/texts/46" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 3, - "bbox": { - "l": 110.07231140136719, - "t": 719.2913360595703, - "r": 500.7577209472656, - "b": 581.2926177978516, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] } ], "headings": [ @@ -3261,3964 +3146,1293 @@ } }, { - "text": "In this image there is a table with some text on it.", + "text": "AGL Energy Limited ABN 74 1", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ { - "self_ref": "#/pictures/2", + "self_ref": "#/texts/393", "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/129" - }, - { - "$ref": "#/texts/130" - }, - { - "$ref": "#/texts/131" - }, - { - "$ref": "#/texts/132" - }, - { - "$ref": "#/texts/133" - }, - { - "$ref": "#/texts/134" - }, - { - "$ref": "#/texts/135" - }, - { - "$ref": "#/texts/136" - }, - { - "$ref": "#/texts/137" - }, - { - "$ref": "#/texts/138" - }, - { - "$ref": "#/texts/139" - }, - { - "$ref": "#/texts/140" - }, - { - "$ref": "#/texts/141" - }, - { - "$ref": "#/texts/142" - }, - { - "$ref": "#/texts/143" - }, - { - "$ref": "#/texts/144" - }, - { - "$ref": "#/texts/145" - }, - { - "$ref": "#/texts/146" - }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/147" - }, + "page_no": 7, + "bbox": { + "l": 226.786, + 
"t": 560.516, + "r": 233.176, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "5 061 375", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/394", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/148" - }, + "page_no": 7, + "bbox": { + "l": 233.40500000000003, + "t": 560.516, + "r": 235.66499999999996, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 1: Four examples of complex page layouts across different document categories", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/503", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/149" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 499.2799999999999, + "r": 312.251, + "b": 490.75200000000007, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 84 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/505", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/150" - }, - { - "$ref": "#/texts/151" - }, - { - "$ref": "#/texts/152" - }, - { - "$ref": "#/texts/153" - }, - { - "$ref": "#/texts/154" - }, - { - "$ref": "#/texts/155" - }, - { - "$ref": "#/texts/156" - }, - { - "$ref": "#/texts/157" - }, - { - "$ref": "#/texts/158" - }, - { - "$ref": "#/texts/159" - }, - { - "$ref": "#/texts/160" - }, - { - "$ref": "#/texts/161" - }, - { - "$ref": "#/texts/162" - }, - { - "$ref": "#/texts/163" - }, - { - "$ref": "#/texts/164" - }, - { - "$ref": "#/texts/165" - }, - { - "$ref": "#/texts/166" - }, - { - "$ref": "#/texts/167" - }, - { - "$ref": "#/texts/168" - }, - { - "$ref": "#/texts/169" - }, - { - "$ref": "#/texts/170" - }, - { - "$ref": "#/texts/171" - }, - { - "$ref": "#/texts/172" - }, - { - "$ref": "#/texts/173" - }, - { - "$ref": "#/texts/174" - }, - { - "$ref": "#/texts/175" - }, - { - "$ref": "#/texts/176" - }, - { - "$ref": "#/texts/177" - }, - { - "$ref": "#/texts/178" - }, - { - "$ref": "#/texts/179" - }, - { - "$ref": "#/texts/180" - }, - { - "$ref": "#/texts/181" - }, - { - "$ref": "#/texts/182" - }, - { - "$ref": "#/texts/183" - }, - { - "$ref": "#/texts/184" - }, - { - "$ref": "#/texts/185" - }, - { - "$ref": "#/texts/186" - }, - { - "$ref": "#/texts/187" - }, - { - "$ref": "#/texts/188" - }, - { - "$ref": "#/texts/189" - }, - { - "$ref": 
"#/texts/190" - }, - { - "$ref": "#/texts/191" - }, - { - "$ref": "#/texts/192" - }, - { - "$ref": "#/texts/193" - }, - { - "$ref": "#/texts/194" - }, - { - "$ref": "#/texts/195" - }, - { - "$ref": "#/texts/196" - }, - { - "$ref": "#/texts/197" - }, - { - "$ref": "#/texts/198" - }, - { - "$ref": "#/texts/199" - }, - { - "$ref": "#/texts/200" - }, - { - "$ref": "#/texts/201" - }, - { - "$ref": "#/texts/202" - }, - { - "$ref": "#/texts/203" - }, - { - "$ref": "#/texts/204" - }, - { - "$ref": "#/texts/205" - }, - { - "$ref": "#/texts/206" - }, - { - "$ref": "#/texts/207" - }, - { - "$ref": "#/texts/208" - }, - { - "$ref": "#/texts/209" - }, - { - "$ref": "#/texts/210" - }, - { - "$ref": "#/texts/211" - }, - { - "$ref": "#/texts/212" - }, - { - "$ref": "#/texts/213" - }, - { - "$ref": "#/texts/214" - }, - { - "$ref": "#/texts/215" - }, - { - "$ref": "#/texts/216" - }, - { - "$ref": "#/texts/217" - }, - { - "$ref": "#/texts/218" - }, - { - "$ref": "#/texts/219" - }, - { - "$ref": "#/texts/220" - }, - { - "$ref": "#/texts/221" - }, - { - "$ref": "#/texts/222" - }, - { - "$ref": "#/texts/223" - }, - { - "$ref": "#/texts/224" - }, - { - "$ref": "#/texts/225" - }, - { - "$ref": "#/texts/226" - }, - { - "$ref": "#/texts/227" - }, - { - "$ref": "#/texts/228" - }, - { - "$ref": "#/texts/229" - }, - { - "$ref": "#/texts/230" - }, - { - "$ref": "#/texts/231" - }, - { - "$ref": "#/texts/232" - }, - { - "$ref": "#/texts/233" - }, - { - "$ref": "#/texts/234" - }, - { - "$ref": "#/texts/235" - }, - { - "$ref": "#/texts/236" - }, - { - "$ref": "#/texts/237" - }, - { - "$ref": "#/texts/238" - }, - { - "$ref": "#/texts/239" - }, - { - "$ref": "#/texts/240" - }, - { - "$ref": "#/texts/241" - }, - { - "$ref": "#/texts/242" - }, - { - "$ref": "#/texts/243" - }, - { - "$ref": "#/texts/244" - }, - { - "$ref": "#/texts/245" - }, - { - "$ref": "#/texts/246" - }, - { - "$ref": "#/texts/247" - }, - { - "$ref": "#/texts/248" - }, - { - "$ref": "#/texts/249" - }, - { - "$ref": "#/texts/250" - }, - { - "$ref": "#/texts/251" - }, - { - "$ref": "#/texts/252" - }, - { - "$ref": "#/texts/253" - }, - { - "$ref": "#/texts/254" - }, - { - "$ref": "#/texts/255" - }, - { - "$ref": "#/texts/256" - }, - { - "$ref": "#/texts/257" - }, - { - "$ref": "#/texts/258" - }, - { - "$ref": "#/texts/259" - }, - { - "$ref": "#/texts/260" - }, - { - "$ref": "#/texts/261" - }, - { - "$ref": "#/texts/262" - }, - { - "$ref": "#/texts/263" - }, - { - "$ref": "#/texts/264" - }, - { - "$ref": "#/texts/265" - }, - { - "$ref": "#/texts/266" - }, - { - "$ref": "#/texts/267" - }, - { - "$ref": "#/texts/268" - }, - { - "$ref": "#/texts/269" - }, - { - "$ref": "#/texts/270" - }, - { - "$ref": "#/texts/271" - }, - { - "$ref": "#/texts/272" - }, - { - "$ref": "#/texts/273" - }, - { - "$ref": "#/texts/274" - }, - { - "$ref": "#/texts/275" - }, - { - "$ref": "#/texts/276" - }, - { - "$ref": "#/texts/277" - }, - { - "$ref": "#/texts/278" - }, - { - "$ref": "#/texts/279" - }, - { - "$ref": "#/texts/280" - }, - { - "$ref": "#/texts/281" - }, - { - "$ref": "#/texts/282" - }, - { - "$ref": "#/texts/283" - }, - { - "$ref": "#/texts/284" - }, - { - "$ref": "#/texts/285" - }, - { - "$ref": "#/texts/286" - }, - { - "$ref": "#/texts/287" - }, - { - "$ref": "#/texts/288" - }, - { - "$ref": "#/texts/289" - }, - { - "$ref": "#/texts/290" - }, - { - "$ref": "#/texts/291" - }, - { - "$ref": "#/texts/292" - }, - { - "$ref": "#/texts/293" - }, - { - "$ref": "#/texts/294" - }, - { - "$ref": "#/texts/295" - }, - { - "$ref": "#/texts/296" - }, - { - "$ref": "#/texts/297" - }, - { 
- "$ref": "#/texts/298" - }, - { - "$ref": "#/texts/299" - }, - { - "$ref": "#/texts/300" - }, - { - "$ref": "#/texts/301" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 223.45245361328125, - "t": 606.3411560058594, - "r": 277.1462707519531, - "b": 563.2440032958984, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image we can see a text.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/3", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/302" - }, - { - "$ref": "#/texts/303" - }, - { - "$ref": "#/texts/304" - }, - { - "$ref": "#/texts/305" - }, - { - "$ref": "#/texts/306" - }, - { - "$ref": "#/texts/307" - }, - { - "$ref": "#/texts/308" - }, - { - "$ref": "#/texts/309" - }, - { - "$ref": "#/texts/310" - }, - { - "$ref": "#/texts/311" - }, - { - "$ref": "#/texts/312" - }, - { - "$ref": "#/texts/313" - }, - { - "$ref": "#/texts/314" - }, - { - "$ref": "#/texts/315" - }, - { - "$ref": "#/texts/316" - }, - { - "$ref": "#/texts/317" - }, - { - "$ref": "#/texts/318" - }, - { - "$ref": "#/texts/319" - }, - { - "$ref": "#/texts/320" - }, - { - "$ref": "#/texts/321" - }, - { - "$ref": "#/texts/322" - }, - { - "$ref": "#/texts/323" - }, - { - "$ref": "#/texts/324" - }, - { - "$ref": "#/texts/325" - }, - { - "$ref": "#/texts/326" - }, - { - "$ref": "#/texts/327" - }, - { - "$ref": "#/texts/328" - }, - { - "$ref": "#/texts/329" - }, - { - "$ref": "#/texts/330" - }, - { - "$ref": "#/texts/331" - }, - { - "$ref": "#/texts/332" - }, - { - "$ref": "#/texts/333" - }, - { - "$ref": "#/texts/334" - }, - { - "$ref": "#/texts/335" - }, - { - "$ref": "#/texts/336" - }, - { - "$ref": "#/texts/337" - }, - { - "$ref": "#/texts/338" - }, - { - "$ref": "#/texts/339" - }, - { - "$ref": "#/texts/340" - }, - { - "$ref": "#/texts/341" - }, - { - "$ref": "#/texts/342" - }, - { - "$ref": "#/texts/343" - }, - { - "$ref": "#/texts/344" - }, - { - "$ref": "#/texts/345" - }, - { - "$ref": "#/texts/346" - }, - { - "$ref": "#/texts/347" - }, - { - "$ref": "#/texts/348" - }, - { - "$ref": "#/texts/349" - }, - { - "$ref": "#/texts/350" - }, - { - "$ref": "#/texts/351" - }, - { - "$ref": "#/texts/352" - }, - { - "$ref": "#/texts/353" - }, - { - "$ref": "#/texts/354" - }, - { - "$ref": "#/texts/355" - }, - { - "$ref": "#/texts/356" - }, - { - "$ref": "#/texts/357" - }, - { - "$ref": "#/texts/358" - }, - { - "$ref": "#/texts/359" - }, - { - "$ref": "#/texts/360" - }, - { - "$ref": "#/texts/361" - }, - { - "$ref": "#/texts/362" - }, - { - "$ref": "#/texts/363" - }, - { - "$ref": "#/texts/364" - }, - { - "$ref": "#/texts/365" - }, - { - "$ref": "#/texts/366" - }, - { - "$ref": "#/texts/367" - }, - { - "$ref": "#/texts/368" - }, - { - "$ref": "#/texts/369" - }, - { - "$ref": "#/texts/370" - }, - { - "$ref": "#/texts/371" - }, - { - "$ref": "#/texts/372" - }, - { - "$ref": "#/texts/373" - }, - { - "$ref": "#/texts/374" - }, - { - "$ref": "#/texts/375" - }, - { - "$ref": "#/texts/376" - }, - { - "$ref": "#/texts/377" - }, - { - "$ref": "#/texts/378" - }, - { - "$ref": "#/texts/379" - }, - { - "$ref": "#/texts/380" - }, - { - "$ref": "#/texts/381" - }, - { - "$ref": "#/texts/382" - 
}, - { - "$ref": "#/texts/383" - }, - { - "$ref": "#/texts/384" - }, - { - "$ref": "#/texts/385" - }, - { - "$ref": "#/texts/386" - }, - { - "$ref": "#/texts/387" - }, - { - "$ref": "#/texts/388" - }, - { - "$ref": "#/texts/389" - }, - { - "$ref": "#/texts/390" - }, - { - "$ref": "#/texts/391" - }, - { - "$ref": "#/texts/392" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 279.03204345703125, - "t": 607.0251770019531, - "r": 312.2338562011719, - "b": 562.7499389648438, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "AGL Energy Limited ABN 74 1", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/393", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 226.786, - "t": 560.516, - "r": 233.176, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "5 061 375", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/394", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 233.40500000000003, - "t": 560.516, - "r": 235.66499999999996, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 9 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image I can see the text on the image.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/4", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/395" - }, - { - "$ref": "#/texts/396" - }, - { - "$ref": "#/texts/397" - }, - { - "$ref": "#/texts/398" - }, - { - "$ref": "#/texts/399" - }, - { - "$ref": "#/texts/400" - }, - { - "$ref": "#/texts/401" - }, - { - "$ref": "#/texts/402" - }, - { - "$ref": "#/texts/403" - }, - { - "$ref": "#/texts/404" - }, - { - "$ref": "#/texts/405" - }, - { - "$ref": "#/texts/406" - }, - { - "$ref": "#/texts/407" - }, - { - "$ref": "#/texts/408" - }, - { - "$ref": "#/texts/409" - }, - { - "$ref": "#/texts/410" - }, - { - "$ref": "#/texts/411" - }, - { - "$ref": "#/texts/412" - }, - { - "$ref": "#/texts/413" - }, - { - "$ref": "#/texts/414" - }, - { - "$ref": "#/texts/415" - }, - { - "$ref": "#/texts/416" - }, - { - "$ref": "#/texts/417" - }, - { - "$ref": "#/texts/418" - }, - { - "$ref": "#/texts/419" - }, - { - "$ref": "#/texts/420" - }, - { - "$ref": "#/texts/421" - }, - { - "$ref": "#/texts/422" - }, - { - "$ref": "#/texts/423" - }, - { - "$ref": "#/texts/424" - }, - { - "$ref": "#/texts/425" - }, - { - "$ref": "#/texts/426" - }, - { - 
"$ref": "#/texts/427" - }, - { - "$ref": "#/texts/428" - }, - { - "$ref": "#/texts/429" - }, - { - "$ref": "#/texts/430" - }, - { - "$ref": "#/texts/431" - }, - { - "$ref": "#/texts/432" - }, - { - "$ref": "#/texts/433" - }, - { - "$ref": "#/texts/434" - }, - { - "$ref": "#/texts/435" - }, - { - "$ref": "#/texts/436" - }, - { - "$ref": "#/texts/437" - }, - { - "$ref": "#/texts/438" - }, - { - "$ref": "#/texts/439" - }, - { - "$ref": "#/texts/440" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 224.6795196533203, - "t": 559.731201171875, - "r": 268.13018798828125, - "b": 503.4937438964844, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image there is a paper with some text on it.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/5", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/441" - }, - { - "$ref": "#/texts/442" - }, - { - "$ref": "#/texts/443" - }, - { - "$ref": "#/texts/444" - }, - { - "$ref": "#/texts/445" - }, - { - "$ref": "#/texts/446" - }, - { - "$ref": "#/texts/447" - }, - { - "$ref": "#/texts/448" - }, - { - "$ref": "#/texts/449" - }, - { - "$ref": "#/texts/450" - }, - { - "$ref": "#/texts/451" - }, - { - "$ref": "#/texts/452" - }, - { - "$ref": "#/texts/453" - }, - { - "$ref": "#/texts/454" - }, - { - "$ref": "#/texts/455" - }, - { - "$ref": "#/texts/456" - }, - { - "$ref": "#/texts/457" - }, - { - "$ref": "#/texts/458" - }, - { - "$ref": "#/texts/459" - }, - { - "$ref": "#/texts/460" - }, - { - "$ref": "#/texts/461" - }, - { - "$ref": "#/texts/462" - }, - { - "$ref": "#/texts/463" - }, - { - "$ref": "#/texts/464" - }, - { - "$ref": "#/texts/465" - }, - { - "$ref": "#/texts/466" - }, - { - "$ref": "#/texts/467" - }, - { - "$ref": "#/texts/468" - }, - { - "$ref": "#/texts/469" - }, - { - "$ref": "#/texts/470" - }, - { - "$ref": "#/texts/471" - }, - { - "$ref": "#/texts/472" - }, - { - "$ref": "#/texts/473" - }, - { - "$ref": "#/texts/474" - }, - { - "$ref": "#/texts/475" - }, - { - "$ref": "#/texts/476" - }, - { - "$ref": "#/texts/477" - }, - { - "$ref": "#/texts/478" - }, - { - "$ref": "#/texts/479" - }, - { - "$ref": "#/texts/480" - }, - { - "$ref": "#/texts/481" - }, - { - "$ref": "#/texts/482" - }, - { - "$ref": "#/texts/483" - }, - { - "$ref": "#/texts/484" - }, - { - "$ref": "#/texts/485" - }, - { - "$ref": "#/texts/486" - }, - { - "$ref": "#/texts/487" - }, - { - "$ref": "#/texts/488" - }, - { - "$ref": "#/texts/489" - }, - { - "$ref": "#/texts/490" - }, - { - "$ref": "#/texts/491" - }, - { - "$ref": "#/texts/492" - }, - { - "$ref": "#/texts/493" - }, - { - "$ref": "#/texts/494" - }, - { - "$ref": "#/texts/495" - }, - { - "$ref": "#/texts/496" - }, - { - "$ref": "#/texts/497" - }, - { - "$ref": "#/texts/498" - }, - { - "$ref": "#/texts/499" - }, - { - "$ref": "#/texts/500" - }, - { - "$ref": "#/texts/501" - }, - { - "$ref": "#/texts/502" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 269.2328186035156, - "t": 558.8644409179688, - "r": 311.74884033203125, - "b": 502.994873046875, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - 
] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 1: Four examples of complex page layouts across different document categories", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/503", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 499.2799999999999, - "r": 312.251, - "b": 490.75200000000007, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 84 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/505", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 474.62299999999993, - "r": 312.021, - "b": 465.961, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 90 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. 
https://doi.org/10.1145/ 3534678.3539043", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/507", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 458.719, - "r": 312.156, - "b": 436.156, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 374 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "1 INTRODUCTION", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/508", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 329.602, - "t": 428.537, - "r": 373.375, - "b": 423.963, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 14 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/509", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 108.0, - "t": 419.051, - "r": 527.591, - "b": 377.77099999999996, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1026 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. 
Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/511", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.99899999999998, - "t": 563.105, - "r": 338.603, - "b": 558.655, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 130 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/512", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.87200000000001, - "t": 552.103, - "r": 226.37599999999998, - "b": 509.485, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 489 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------|\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/1", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 125.8864517211914, - "t": 505.50439453125, - "r": 223.0050506591797, - "b": 437.8017272949219, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. 
Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/513", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.884, - "t": 431.161, - "r": 226.336, - "b": 341.5470000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1252 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/515", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 122.86499999999998, - "t": 327.581, - "r": 226.282, - "b": 284.81, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 584 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image, we can see a table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/6", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 366.8663635253906, - "t": 542.9663391113281, - "r": 460.8086242675781, - "b": 450.9350280761719, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Third, achienec", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/516", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 436.0, - "t": 447.0, - "r": 509.66666666666663, - "b": 418.66666666666663, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 15 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "chalenongayouls ground-vuth dawa such WC", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/518", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 366.0, - "t": 386.0, - "r": 529.3333333333334, - "b": 375.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 40 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/7", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 237.6404266357422, - "t": 550.1458740234375, - "r": 337.0112609863281, - "b": 477.0093078613281, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/519", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 469.97300000000007, - "r": 339.288, - "b": 441.408, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 322 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/520", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 425.568, - "r": 338.603, - "b": 415.587, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 102 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). 
These scores are computed by leveraging the evaluation code provided by the COCO API [16].", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/521", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.776, - "t": 416.19999999999993, - "r": 338.703, - "b": 382.7970000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 397 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . 
This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/523", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 235.823, - "t": 370.85, - "r": 338.7, - "b": 285.921, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1146 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "coioct dcochon modols", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/524", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 456.6666666666667, - "t": 344.0, - "r": 485.33333333333337, - "b": 341.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 21 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "mak enbrel", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/526", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 470.6666666666667, - "t": 308.6666666666667, - "r": 524.0, - "b": 285.3333333333333, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 10 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/527", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 8, - "bbox": { - "l": 108.0, - "t": 266.424, - "r": 504.00300000000004, - "b": 225.14499999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 393 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/529", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 598.985, - "r": 186.95, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 48 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/530", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 190.471, - "t": 598.985, - "r": 346.254, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 81 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/531", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.525, - "t": 586.821, - "r": 346.401, - "b": 580.676, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 123 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/532", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 575.628, - "r": 301.135, - "b": 569.484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 99 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/533", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 581.225, - "r": 346.254, - "b": 575.08, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 124 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. 
**Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/8", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/534" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.43017578125, - "t": 573.9806060791016, - "r": 124.71578216552734, - "b": 559.4710540771484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/9", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/535" - }, - { - "$ref": "#/texts/536" - }, - { - "$ref": "#/texts/537" - }, - { - "$ref": "#/texts/538" - }, - { - "$ref": "#/texts/539" - }, - { - "$ref": "#/texts/540" - }, - { - "$ref": "#/texts/541" - }, - { - "$ref": "#/texts/542" - }, - { - "$ref": "#/texts/543" - }, - { - "$ref": "#/texts/544" - }, - { - "$ref": "#/texts/545" - }, - { - "$ref": "#/texts/546" - }, - { - "$ref": "#/texts/547" - }, - { - "$ref": "#/texts/548" - }, - { - "$ref": "#/texts/549" - }, - { - "$ref": "#/texts/550" - }, - { - "$ref": "#/texts/551" - }, - { - "$ref": "#/texts/552" - }, - { - "$ref": "#/texts/553" - }, - { - "$ref": "#/texts/554" - }, - { - "$ref": "#/texts/555" - }, - { - "$ref": "#/texts/556" - }, - { - "$ref": "#/texts/557" - }, - { - "$ref": "#/texts/558" - }, - { - "$ref": "#/texts/559" - }, - { - "$ref": "#/texts/560" - }, - { - "$ref": "#/texts/561" - }, - { - "$ref": "#/texts/562" - }, - { - "$ref": "#/texts/563" - }, - { - "$ref": "#/texts/564" - }, - { - "$ref": "#/texts/565" - }, - { - "$ref": "#/texts/566" - }, - { - "$ref": "#/texts/567" - }, - { - "$ref": "#/texts/568" - }, - { - "$ref": "#/texts/569" - }, - { - "$ref": "#/texts/570" - }, - { - "$ref": "#/texts/571" - }, - { - "$ref": "#/texts/572" - }, - { - "$ref": "#/texts/573" - }, - { - "$ref": "#/texts/574" - }, - { - "$ref": "#/texts/575" - }, - { - "$ref": "#/texts/576" - }, - { - "$ref": "#/texts/577" - }, - { - "$ref": "#/texts/578" - }, - { - "$ref": "#/texts/579" - }, - { - "$ref": "#/texts/580" - }, - { - "$ref": "#/texts/581" - }, - { - "$ref": "#/texts/582" - }, - { - "$ref": "#/texts/583" - }, - { - "$ref": "#/texts/584" - }, - { - "$ref": "#/texts/585" - }, - { - "$ref": "#/texts/586" - }, - { - "$ref": "#/texts/587" - }, - { - "$ref": "#/texts/588" - }, - { - "$ref": "#/texts/589" - }, - { - "$ref": "#/texts/590" - }, - { - "$ref": "#/texts/591" - }, - { - "$ref": "#/texts/592" - }, - { - "$ref": "#/texts/593" - }, - { - "$ref": "#/texts/594" - }, - { - "$ref": "#/texts/595" - }, - { - "$ref": "#/texts/596" - }, - { - "$ref": "#/texts/597" - }, - { - "$ref": "#/texts/598" - }, - { - "$ref": "#/texts/599" - }, - { 
- "$ref": "#/texts/600" - }, - { - "$ref": "#/texts/601" - }, - { - "$ref": "#/texts/602" - }, - { - "$ref": "#/texts/603" - }, - { - "$ref": "#/texts/604" - }, - { - "$ref": "#/texts/605" - }, - { - "$ref": "#/texts/606" - }, - { - "$ref": "#/texts/607" - }, - { - "$ref": "#/texts/608" - }, - { - "$ref": "#/texts/609" - }, - { - "$ref": "#/texts/610" - }, - { - "$ref": "#/texts/611" - }, - { - "$ref": "#/texts/612" - }, - { - "$ref": "#/texts/613" - }, - { - "$ref": "#/texts/614" - }, - { - "$ref": "#/texts/615" - }, - { - "$ref": "#/texts/616" - }, - { - "$ref": "#/texts/617" - }, - { - "$ref": "#/texts/618" - }, - { - "$ref": "#/texts/619" - }, - { - "$ref": "#/texts/620" - }, - { - "$ref": "#/texts/621" - }, - { - "$ref": "#/texts/622" - }, - { - "$ref": "#/texts/623" - }, - { - "$ref": "#/texts/624" - }, - { - "$ref": "#/texts/625" - }, - { - "$ref": "#/texts/626" - }, - { - "$ref": "#/texts/627" - }, - { - "$ref": "#/texts/628" - }, - { - "$ref": "#/texts/629" - }, - { - "$ref": "#/texts/630" - }, - { - "$ref": "#/texts/631" - }, - { - "$ref": "#/texts/632" - }, - { - "$ref": "#/texts/633" - }, - { - "$ref": "#/texts/634" - }, - { - "$ref": "#/texts/635" - }, - { - "$ref": "#/texts/636" - }, - { - "$ref": "#/texts/637" - }, - { - "$ref": "#/texts/638" - }, - { - "$ref": "#/texts/639" - }, - { - "$ref": "#/texts/640" - }, - { - "$ref": "#/texts/641" - }, - { - "$ref": "#/texts/642" - }, - { - "$ref": "#/texts/643" - }, - { - "$ref": "#/texts/644" - }, - { - "$ref": "#/texts/645" - }, - { - "$ref": "#/texts/646" - }, - { - "$ref": "#/texts/647" - }, - { - "$ref": "#/texts/648" - }, - { - "$ref": "#/texts/649" - }, - { - "$ref": "#/texts/650" - }, - { - "$ref": "#/texts/651" - }, - { - "$ref": "#/texts/652" - }, - { - "$ref": "#/texts/653" - }, - { - "$ref": "#/texts/654" - }, - { - "$ref": "#/texts/655" - }, - { - "$ref": "#/texts/656" - }, - { - "$ref": "#/texts/657" - }, - { - "$ref": "#/texts/658" - }, - { - "$ref": "#/texts/659" - }, - { - "$ref": "#/texts/660" - }, - { - "$ref": "#/texts/661" - }, - { - "$ref": "#/texts/662" - }, - { - "$ref": "#/texts/663" - }, - { - "$ref": "#/texts/664" - }, - { - "$ref": "#/texts/665" - }, - { - "$ref": "#/texts/666" - }, - { - "$ref": "#/texts/667" - }, - { - "$ref": "#/texts/668" - }, - { - "$ref": "#/texts/669" - }, - { - "$ref": "#/texts/670" - }, - { - "$ref": "#/texts/671" - }, - { - "$ref": "#/texts/672" - }, - { - "$ref": "#/texts/673" - }, - { - "$ref": "#/texts/674" - }, - { - "$ref": "#/texts/675" - }, - { - "$ref": "#/texts/676" - }, - { - "$ref": "#/texts/677" - }, - { - "$ref": "#/texts/678" - }, - { - "$ref": "#/texts/679" - }, - { - "$ref": "#/texts/680" - }, - { - "$ref": "#/texts/681" - }, - { - "$ref": "#/texts/682" - }, - { - "$ref": "#/texts/683" - }, - { - "$ref": "#/texts/684" - }, - { - "$ref": "#/texts/685" - }, - { - "$ref": "#/texts/686" - }, - { - "$ref": "#/texts/687" - }, - { - "$ref": "#/texts/688" - }, - { - "$ref": "#/texts/689" - }, - { - "$ref": "#/texts/690" - }, - { - "$ref": "#/texts/691" - }, - { - "$ref": "#/texts/692" - }, - { - "$ref": "#/texts/693" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 
14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/3", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this image I can see a blue circle.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/10", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/694" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 332.130615234375, - "t": 576.3017578125, - "r": 346.93829345703125, - "b": 560.4401550292969, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 
14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "include publication repositories such as arXiv", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/695", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 223.57, - "t": 471.407, - "r": 306.847, - "b": 465.079, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 46 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/696", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 146 - ] - }, - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 147, - 294 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/697", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 564.097, - "r": 408.543, - "b": 561.395, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 54 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "A table with different columns and rows.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/11", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/698" - }, - { - "$ref": "#/texts/699" - }, - { - "$ref": "#/texts/700" - }, - { - "$ref": "#/texts/701" - }, - { - "$ref": "#/texts/702" - }, - { - "$ref": "#/texts/703" - }, - { - "$ref": "#/texts/704" - }, - { - "$ref": "#/texts/705" - }, - { - "$ref": "#/texts/706" - }, - { - "$ref": "#/texts/707" - }, - { - "$ref": "#/texts/708" - }, - { - "$ref": "#/texts/709" - }, - { - "$ref": "#/texts/710" - }, - { - "$ref": "#/texts/711" - }, - { - "$ref": "#/texts/712" - }, - { - "$ref": "#/texts/713" - }, - { - "$ref": "#/texts/714" - }, - { - "$ref": "#/texts/715" - }, - { - "$ref": "#/texts/716" - }, - { - "$ref": 
"#/texts/717" - }, - { - "$ref": "#/texts/718" - }, - { - "$ref": "#/texts/719" - }, - { - "$ref": "#/texts/720" - }, - { - "$ref": "#/texts/721" - }, - { - "$ref": "#/texts/722" - }, - { - "$ref": "#/texts/723" - }, - { - "$ref": "#/texts/724" - }, - { - "$ref": "#/texts/725" - }, - { - "$ref": "#/texts/726" - }, - { - "$ref": "#/texts/727" - }, - { - "$ref": "#/texts/728" - }, - { - "$ref": "#/texts/729" - }, - { - "$ref": "#/texts/730" - }, - { - "$ref": "#/texts/731" - }, - { - "$ref": "#/texts/732" - }, - { - "$ref": "#/texts/733" - }, - { - "$ref": "#/texts/734" - }, - { - "$ref": "#/texts/735" - }, - { - "$ref": "#/texts/736" - }, - { - "$ref": "#/texts/737" - }, - { - "$ref": "#/texts/738" - }, - { - "$ref": "#/texts/739" - }, - { - "$ref": "#/texts/740" - }, - { - "$ref": "#/texts/741" - }, - { - "$ref": "#/texts/742" - }, - { - "$ref": "#/texts/743" - }, - { - "$ref": "#/texts/744" - }, - { - "$ref": "#/texts/745" - }, - { - "$ref": "#/texts/746" - }, - { - "$ref": "#/texts/747" - }, - { - "$ref": "#/texts/748" - }, - { - "$ref": "#/texts/749" - }, - { - "$ref": "#/texts/750" - }, - { - "$ref": "#/texts/751" - }, - { - "$ref": "#/texts/752" - }, - { - "$ref": "#/texts/753" - }, - { - "$ref": "#/texts/754" - }, - { - "$ref": "#/texts/755" - }, - { - "$ref": "#/texts/756" - }, - { - "$ref": "#/texts/757" - }, - { - "$ref": "#/texts/758" - }, - { - "$ref": "#/texts/759" - }, - { - "$ref": "#/texts/760" - }, - { - "$ref": "#/texts/761" - }, - { - "$ref": "#/texts/762" - }, - { - "$ref": "#/texts/763" - }, - { - "$ref": "#/texts/764" - }, - { - "$ref": "#/texts/765" - }, - { - "$ref": "#/texts/766" - }, - { - "$ref": "#/texts/767" - }, - { - "$ref": "#/texts/768" - }, - { - "$ref": "#/texts/769" - }, - { - "$ref": "#/texts/770" - }, - { - "$ref": "#/texts/771" - }, - { - "$ref": "#/texts/772" - }, - { - "$ref": "#/texts/773" - }, - { - "$ref": "#/texts/774" - }, - { - "$ref": "#/texts/775" - }, - { - "$ref": "#/texts/776" - }, - { - "$ref": "#/texts/777" - }, - { - "$ref": "#/texts/778" - }, - { - "$ref": "#/texts/779" - }, - { - "$ref": "#/texts/780" - }, - { - "$ref": "#/texts/781" - }, - { - "$ref": "#/texts/782" - }, - { - "$ref": "#/texts/783" - }, - { - "$ref": "#/texts/784" - }, - { - "$ref": "#/texts/785" - }, - { - "$ref": "#/texts/786" - }, - { - "$ref": "#/texts/787" - }, - { - "$ref": "#/texts/788" - }, - { - "$ref": "#/texts/789" - }, - { - "$ref": "#/texts/790" - }, - { - "$ref": "#/texts/791" - }, - { - "$ref": "#/texts/792" - }, - { - "$ref": "#/texts/793" - }, - { - "$ref": "#/texts/794" - }, - { - "$ref": "#/texts/795" - }, - { - "$ref": "#/texts/796" - }, - { - "$ref": "#/texts/797" - }, - { - "$ref": "#/texts/798" - }, - { - "$ref": "#/texts/799" - }, - { - "$ref": "#/texts/800" - }, - { - "$ref": "#/texts/801" - }, - { - "$ref": "#/texts/802" - }, - { - "$ref": "#/texts/803" - }, - { - "$ref": "#/texts/804" - }, - { - "$ref": "#/texts/805" - }, - { - "$ref": "#/texts/806" - }, - { - "$ref": "#/texts/807" - }, - { - "$ref": "#/texts/808" - }, - { - "$ref": "#/texts/809" - }, - { - "$ref": "#/texts/810" - }, - { - "$ref": "#/texts/811" - }, - { - "$ref": "#/texts/812" - }, - { - "$ref": "#/texts/813" - }, - { - "$ref": "#/texts/814" - }, - { - "$ref": "#/texts/815" - }, - { - "$ref": "#/texts/816" - }, - { - "$ref": "#/texts/817" - }, - { - "$ref": "#/texts/818" - }, - { - "$ref": "#/texts/819" - }, - { - "$ref": "#/texts/820" - }, - { - "$ref": "#/texts/821" - }, - { - "$ref": "#/texts/822" - }, - { - "$ref": "#/texts/823" - }, - { - "$ref": "#/texts/824" - }, - { 
- "$ref": "#/texts/825" - }, - { - "$ref": "#/texts/826" - }, - { - "$ref": "#/texts/827" - }, - { - "$ref": "#/texts/828" - }, - { - "$ref": "#/texts/829" - }, - { - "$ref": "#/texts/830" - }, - { - "$ref": "#/texts/831" - }, - { - "$ref": "#/texts/832" - }, - { - "$ref": "#/texts/833" - }, - { - "$ref": "#/texts/834" - }, - { - "$ref": "#/texts/835" - }, - { - "$ref": "#/texts/836" - }, - { - "$ref": "#/texts/837" - }, - { - "$ref": "#/texts/838" - }, - { - "$ref": "#/texts/839" - }, - { - "$ref": "#/texts/840" - }, - { - "$ref": "#/texts/841" - }, - { - "$ref": "#/texts/842" - }, - { - "$ref": "#/texts/843" - }, - { - "$ref": "#/texts/844" - }, - { - "$ref": "#/texts/845" - }, - { - "$ref": "#/texts/846" - }, - { - "$ref": "#/texts/847" - }, - { - "$ref": "#/texts/848" - }, - { - "$ref": "#/texts/849" - }, - { - "$ref": "#/texts/850" - }, - { - "$ref": "#/texts/851" - }, - { - "$ref": "#/texts/852" - }, - { - "$ref": "#/texts/853" - }, - { - "$ref": "#/texts/854" - }, - { - "$ref": "#/texts/855" - }, - { - "$ref": "#/texts/856" - }, - { - "$ref": "#/texts/857" - }, - { - "$ref": "#/texts/858" - }, - { - "$ref": "#/texts/859" - }, - { - "$ref": "#/texts/860" - }, - { - "$ref": "#/texts/861" - }, - { - "$ref": "#/texts/862" - }, - { - "$ref": "#/texts/863" - }, - { - "$ref": "#/texts/864" - }, - { - "$ref": "#/texts/865" - }, - { - "$ref": "#/texts/866" - }, - { - "$ref": "#/texts/867" - }, - { - "$ref": "#/texts/868" - }, - { - "$ref": "#/texts/869" - }, - { - "$ref": "#/texts/870" - }, - { - "$ref": "#/texts/871" - }, - { - "$ref": "#/texts/872" - }, - { - "$ref": "#/texts/873" - }, - { - "$ref": "#/texts/874" - }, - { - "$ref": "#/texts/875" - }, - { - "$ref": "#/texts/876" - }, - { - "$ref": "#/texts/877" - }, - { - "$ref": "#/texts/878" - }, - { - "$ref": "#/texts/879" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 474.62299999999993, + "r": 312.021, + "b": 465.961, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 90 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. 
https://doi.org/10.1145/ 3534678.3539043", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/507", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/880" - }, + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 458.719, + "r": 312.156, + "b": 436.156, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 374 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "1 INTRODUCTION", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/508", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/881" - }, + "page_no": 7, + "bbox": { + "l": 329.602, + "t": 428.537, + "r": 373.375, + "b": 423.963, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 14 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/509", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/882" - }, + "page_no": 7, + "bbox": { + "l": 108.0, + "t": 419.051, + "r": 527.591, + "b": 377.77099999999996, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1026 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. 
Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/511", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/883" - }, + "page_no": 8, + "bbox": { + "l": 122.99899999999998, + "t": 563.105, + "r": 338.603, + "b": 558.655, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 130 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/512", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/884" - }, + "page_no": 8, + "bbox": { + "l": 122.87200000000001, + "t": 552.103, + "r": 226.37599999999998, + "b": 509.485, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 489 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------|\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ { - "$ref": "#/texts/885" - }, + "page_no": 8, + "bbox": { + "l": 125.8864517211914, + "t": 505.50439453125, + "r": 223.0050506591797, + "b": 437.8017272949219, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "to avoid this at any cost in order to have 
clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/513", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/886" - }, + "page_no": 8, + "bbox": { + "l": 122.884, + "t": 431.161, + "r": 226.336, + "b": 341.5470000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1252 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/515", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/887" - }, + "page_no": 8, + "bbox": { + "l": 122.86499999999998, + "t": 327.581, + "r": 226.282, + "b": 284.81, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 584 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Third, achienec", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/516", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/888" - }, + "page_no": 8, + "bbox": { + "l": 436.0, + "t": 447.0, + "r": 509.66666666666663, + "b": 418.66666666666663, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "chalenongayouls ground-vuth dawa such WC", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/518", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/889" - }, + "page_no": 8, + "bbox": { + "l": 366.0, + "t": 386.0, + "r": 529.3333333333334, + "b": 375.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 40 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. 
The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/519", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/890" - }, + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 469.97300000000007, + "r": 339.288, + "b": 441.408, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 322 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/520", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/891" - }, + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 425.568, + "r": 338.603, + "b": 415.587, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 102 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/521", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/892" - }, + "page_no": 8, + "bbox": { + "l": 235.776, + "t": 416.19999999999993, + "r": 338.703, + "b": 382.7970000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 397 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. 
It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/523", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/893" - }, + "page_no": 8, + "bbox": { + "l": 235.823, + "t": 370.85, + "r": 338.7, + "b": 285.921, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1146 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "coioct dcochon modols", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/524", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/894" - }, + "page_no": 8, + "bbox": { + "l": 456.6666666666667, + "t": 344.0, + "r": 485.33333333333337, + "b": 341.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "mak enbrel", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/526", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/895" - }, + "page_no": 8, + "bbox": { + "l": 470.6666666666667, + "t": 308.6666666666667, + "r": 524.0, + "b": 285.3333333333333, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 10 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/527", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/896" - }, + "page_no": 8, + "bbox": { + "l": 108.0, + "t": 266.424, + "r": 504.00300000000004, + "b": 225.14499999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 393 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/529", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/897" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 598.985, + "r": 186.95, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 48 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/530", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/898" - }, + "page_no": 9, + "bbox": { + "l": 190.471, + "t": 598.985, + "r": 346.254, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 81 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/531", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/899" - }, + "page_no": 9, + "bbox": { + "l": 88.525, + "t": 586.821, + "r": 346.401, + "b": 580.676, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 123 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/532", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/900" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 575.628, + "r": 301.135, + "b": 569.484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 99 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/533", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/901" - }, + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 581.225, + "r": 346.254, + "b": 575.08, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 124 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n| List-item 
| 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ { - "$ref": "#/texts/902" - }, + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "include publication repositories such as arXiv", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/695", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/903" - }, + "page_no": 9, + "bbox": { + "l": 223.57, + "t": 471.407, + "r": 306.847, + "b": 465.079, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 46 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. 
The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/696", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ { - "$ref": "#/texts/904" + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 146 + ] }, { - "$ref": "#/texts/905" + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 147, + 294 + ] } - ], + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/697", + "parent": { + "$ref": "#/body" + }, + "children": [], "content_layer": "body", - "meta": {}, - "label": "picture", + "label": "text", "prov": [ { "page_no": 9, "bbox": { - "l": 334.4932861328125, - "t": 558.5665130615234, - "r": 544.7938842773438, - "b": 414.31744384765625, + "l": 335.152, + "t": 564.097, + "r": 408.543, + "b": 561.395, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 0 + 54 ] } ] @@ -7679,249 +4893,6 @@ } } }, - { - "text": "In this image there is a table with some text on it.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/pictures/12", - "parent": { - "$ref": "#/body" - }, - "children": [ - { - "$ref": "#/texts/915" - }, - { - "$ref": "#/texts/916" - }, - { - "$ref": "#/texts/917" - }, - { - "$ref": "#/texts/918" - }, - { - "$ref": "#/texts/919" - }, - { - "$ref": "#/texts/920" - }, - { - "$ref": "#/texts/921" - }, - { - "$ref": "#/texts/922" - }, - { - "$ref": "#/texts/923" - }, - { - "$ref": "#/texts/924" - }, - { - "$ref": "#/texts/925" - }, - { - "$ref": "#/texts/926" - }, - { - "$ref": "#/texts/927" - }, - { - "$ref": "#/texts/928" - }, - { - "$ref": "#/texts/929" - }, - { - "$ref": "#/texts/930" - }, - { - "$ref": "#/texts/931" - }, - { - "$ref": "#/texts/932" - }, - { - "$ref": "#/texts/933" - }, - { - "$ref": "#/texts/934" - }, - { - "$ref": "#/texts/935" - }, - { - "$ref": "#/texts/936" - }, - { - "$ref": "#/texts/937" - }, - { - "$ref": "#/texts/938" - }, - { - "$ref": "#/texts/939" - }, - { - "$ref": "#/texts/940" - }, - { - "$ref": "#/texts/941" - }, - { - "$ref": "#/texts/942" - }, - { - "$ref": "#/texts/943" - }, - { - "$ref": "#/texts/944" - }, - { - "$ref": "#/texts/945" - }, - { - "$ref": "#/texts/946" - }, - { - "$ref": "#/texts/947" - }, - { - "$ref": "#/texts/948" - }, - { - "$ref": "#/texts/949" - }, - { - "$ref": "#/texts/950" - }, - { - "$ref": "#/texts/951" - }, - { - "$ref": "#/texts/952" - }, - { - "$ref": "#/texts/953" - }, - { - "$ref": "#/texts/954" - }, - { - "$ref": "#/texts/955" - }, - { - "$ref": "#/texts/956" - }, - { - "$ref": "#/texts/957" - }, - { - "$ref": "#/texts/958" - }, - { - "$ref": "#/texts/959" - }, - { - "$ref": "#/texts/960" - }, - { - "$ref": "#/texts/961" - }, - { - "$ref": "#/texts/962" - }, - { - "$ref": "#/texts/963" - }, - { - "$ref": "#/texts/964" 
- }, - { - "$ref": "#/texts/965" - }, - { - "$ref": "#/texts/966" - }, - { - "$ref": "#/texts/967" - }, - { - "$ref": "#/texts/968" - }, - { - "$ref": "#/texts/969" - }, - { - "$ref": "#/texts/970" - }, - { - "$ref": "#/texts/971" - }, - { - "$ref": "#/texts/972" - }, - { - "$ref": "#/texts/973" - }, - { - "$ref": "#/texts/974" - }, - { - "$ref": "#/texts/975" - }, - { - "$ref": "#/texts/976" - }, - { - "$ref": "#/texts/977" - }, - { - "$ref": "#/texts/978" - }, - { - "$ref": "#/texts/979" - }, - { - "$ref": "#/texts/980" - } - ], - "content_layer": "body", - "meta": {}, - "label": "picture", - "prov": [ - { - "page_no": 9, - "bbox": { - "l": 108.79005432128906, - "t": 467.1181335449219, - "r": 329.1195068359375, - "b": 308.97198486328125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, { "text": "we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific", "meta": { diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index 235d36c3..87f8951f 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -8,8 +8,6 @@ In this image, we can see some text and images. Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. -In this image, we can see some text and images. - licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index 8170fe3c..ea49a66c 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -2,8 +2,6 @@ In this image we can see a cartoon image of a duck holding a paper. -In this image we can see a cartoon image of a duck holding a paper. - Version 1.0 diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md new file mode 100644 index 00000000..3ba2f46a --- /dev/null +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -0,0 +1,87 @@ +# Docling Technical Report + +[Description] In this image we can see a cartoon image of a duck holding a paper. + + + +Version 1.0 + +Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar + +AI4K Group, IBM Research R¨ uschlikon, Switzerland + +## Abstract + +This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. 
The code interface allows for easy extensibility and addition of new features and models. + +## 1 Introduction + +Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. + +With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. + +[Description] In this image, we can see some text and images. + +torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. + +[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + +summary: Typical Docling setup runtime characterization. +type: performance data + +Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. + +| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | +|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| +| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | +| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | +| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | + +## 5 Applications + +Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. 
Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. + +## 6 Future work and contributions + +Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. + +We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. + +## References + +- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. +- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster + +[Description] In this image there is a table with some text on it. + +[Description] In this image we can see a text. + +[Description] In this image I can see the cover of the book. + +[Description] In this image there is a paper with some text on it. + +[Description] In this image, we can see a table with some text. + +[Description] The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. + +The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. + +### Analysis: + +#### Training Program: +- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. +- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. + +[Description] The image is a flat, two-dimensional representation of a letter "A" on a blue circle. 
The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. + +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. + +The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" + +[Description] In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. + +[Description] In this image I can see a blue circle. + +[Description] A table with different columns and rows. + +[Description] In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md new file mode 100644 index 00000000..c08732f2 --- /dev/null +++ b/test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md @@ -0,0 +1,49 @@ +# Docling Technical Report + +In this image we can see a cartoon image of a duck holding a paper. + + + +Version 1.0 + +Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar + +AI4K Group, IBM Research R¨ uschlikon, Switzerland + +## Abstract + +This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. + +## 1 Introduction + +Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. 
+ +With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. + +torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. + +summary: Typical Docling setup runtime characterization. +type: performance data + +Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. + +| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | +|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| +| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | +| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | +| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | + +## 5 Applications + +Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. + +## 6 Future work and contributions + +Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. 
+ +We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. + +## References + +- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. +- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index e8b988c7..0adc5569 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -1,7 +1,5 @@ Bar chart -bar chart - | Number of impellers | single-frequency | multi-frequency | diff --git a/test/test_serialization.py b/test/test_serialization.py index a8ce96e4..80b051b3 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -302,7 +302,7 @@ def test_md_mark_annotations_false(): ) -def test_md_mark_annotations_true(): +def test_md_mark_meta_true(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -310,6 +310,27 @@ def test_md_mark_annotations_true(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + include_annotations=True, + mark_meta=True, + pages={1, 5}, + ), + ) + actual = ser.serialize().text + verify( + exp_file=src.parent / f"{src.stem}_p1_mark_meta_true.gt.md", + actual=actual, + ) + + +def test_md_use_legacy_annotations_true_mark_annotations_true(): + src = Path("./test/data/doc/2408.09869v3_enriched.json") + doc = DoclingDocument.load_from_json(src) + + ser = MarkdownDocSerializer( + doc=doc, + table_serializer=CustomAnnotationTableSerializer(), + params=MarkdownParams( + use_legacy_annotations=True, include_annotations=True, mark_annotations=True, pages={1, 5}, @@ -317,7 +338,8 @@ def test_md_mark_annotations_true(): ) actual = ser.serialize().text verify( - exp_file=src.parent / f"{src.stem}_p1_mark_annotations_true.gt.md", + exp_file=src.parent + / f"{src.stem}_p1_use_legacy_annotations_true_mark_annotations_true.gt.md", actual=actual, ) From e71e97ddfa6438e9963a6826aedf27a992c11e82 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 20:52:26 +0100 Subject: [PATCH 13/22] remove old file Signed-off-by: Panos Vagenas --- ...v3_enriched_p1_mark_annotations_true.gt.md | 89 ------------------- 1 file changed, 89 deletions(-) delete mode 100644 test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md deleted file mode 100644 index d1d5e8b5..00000000 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md +++ /dev/null @@ -1,89 +0,0 @@ -# Docling Technical Report - -In this image we can see a cartoon image of 
a duck holding a paper. - -In this image we can see a cartoon image of a duck holding a paper. - - - -Version 1.0 - -Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar - -AI4K Group, IBM Research R¨ uschlikon, Switzerland - -## Abstract - -This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. - -## 1 Introduction - -Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. - -With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. - -In this image, we can see some text and images. - -torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. - -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - -summary: Typical Docling setup runtime characterization. -type: performance data - -Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. 
- -| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | -|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| -| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | -| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | -| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | - -## 5 Applications - -Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. - -## 6 Future work and contributions - -Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. - -We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. - -## References - -- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. -- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster - -In this image there is a table with some text on it. 
- -In this image we can see a text. - -In this image I can see the cover of the book. - -In this image there is a paper with some text on it. - -In this image, we can see a table with some text. - -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - -In this image I can see a blue circle. - -A table with different columns and rows. - -In this image there is a text in the middle. 
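
Note: for orientation, a minimal usage sketch of the meta-aware Markdown serialization that the new and updated ground-truth files in this series correspond to. The parameter names mirror the test_serialization.py changes above; the import paths and the omission of the custom table serializer used in the tests are assumptions for illustration, not part of the patch.

    from pathlib import Path

    from docling_core.transforms.serializer.markdown import (
        MarkdownDocSerializer,
        MarkdownParams,
    )
    from docling_core.types.doc.document import DoclingDocument

    # Load a document whose items carry the new metadata (fixture path as used in the tests).
    doc = DoclingDocument.load_from_json(Path("./test/data/doc/2408.09869v3_enriched.json"))

    # Serialize to Markdown with metadata included and marked, as in test_md_mark_meta_true;
    # marked metadata shows up with prefixes such as "[Description]" in the ground-truth output.
    ser = MarkdownDocSerializer(
        doc=doc,
        params=MarkdownParams(
            include_annotations=True,
            mark_meta=True,
            pages={1, 5},
        ),
    )
    print(ser.serialize().text)
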
From a1cacfd702746ed85440be4d08ccbce6a976c155 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 21:20:47 +0100 Subject: [PATCH 14/22] fix item used in get_parts for meta ser Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 5 ++- test/data/doc/2408.09869v3_enriched.gt.md | 38 ------------------- ...nriched_p1_include_annotations_false.gt.md | 36 ------------------ ...3_enriched_p1_mark_annotations_false.gt.md | 36 ------------------ ...8.09869v3_enriched_p1_mark_meta_true.gt.md | 36 ------------------ 5 files changed, 3 insertions(+), 148 deletions(-) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 86fc120c..c2d74f31 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -454,8 +454,9 @@ def get_parts( else: my_visited.add(node.self_ref) - if not params.use_legacy_annotations and ( - not item or item.self_ref not in self.get_excluded_refs(**kwargs) + if ( + not params.use_legacy_annotations + and node.self_ref not in self.get_excluded_refs(**kwargs) ): part = self.serialize_meta( item=node, diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index 87f8951f..a8604726 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -1,5 +1,3 @@ -In this image we can see a cartoon image of a duck holding a paper. - @@ -58,8 +56,6 @@ Establishing GPU acceleration support for the AI models is currently work-in-pro -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - machine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf . @@ -81,40 +77,6 @@ machine learning through dynamic python bytecode transformation and graph compil -In this image there is a table with some text on it. - -In this image we can see a text. - -In this image I can see the cover of the book. - -In this image there is a paper with some text on it. - -In this image, we can see a table with some text. - -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - - -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. 
The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - -In this image I can see a blue circle. - -A table with different columns and rows. - -In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index 6f2233e8..fe55c055 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -20,8 +20,6 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -In this image, we can see some text and images. - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -48,37 +46,3 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster - -In this image there is a table with some text on it. - -In this image we can see a text. - -In this image I can see the cover of the book. - -In this image there is a paper with some text on it. - -In this image, we can see a table with some text. - -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. 
The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - -In this image I can see a blue circle. - -A table with different columns and rows. - -In this image there is a text in the middle. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index ea49a66c..9a894c88 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -20,8 +20,6 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -In this image, we can see some text and images. - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -51,37 +49,3 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. 
https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster - -In this image there is a table with some text on it. - -In this image we can see a text. - -In this image I can see the cover of the book. - -In this image there is a paper with some text on it. - -In this image, we can see a table with some text. - -The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - -The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - -In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - -In this image I can see a blue circle. - -A table with different columns and rows. - -In this image there is a text in the middle. 
diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md index 3ba2f46a..3f8a9266 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -20,8 +20,6 @@ Converting PDF documents back into a machine-processable format has been a major With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. -[Description] In this image, we can see some text and images. - torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. [Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} @@ -51,37 +49,3 @@ We encourage everyone to propose or implement additional features and models, an - [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. - [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. Pytorch 2: Faster - -[Description] In this image there is a table with some text on it. - -[Description] In this image we can see a text. - -[Description] In this image I can see the cover of the book. - -[Description] In this image there is a paper with some text on it. - -[Description] In this image, we can see a table with some text. - -[Description] The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. - -The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. - -### Analysis: - -#### Training Program: -- **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%. - -[Description] The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. 
The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. - -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. - -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A" - -[Description] In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318. - -[Description] In this image I can see a blue circle. - -[Description] A table with different columns and rows. - -[Description] In this image there is a text in the middle. From 3287664d087fbda7ab538754e96f029306a94043 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 21:27:44 +0100 Subject: [PATCH 15/22] serialize GroupItem meta prior to content, DocItem meta after content Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 14 +++++++++++--- test/data/doc/2408.09869v3_enriched.gt.md | 4 ++-- ...9v3_enriched_p1_include_annotations_false.gt.md | 8 ++++---- ...9869v3_enriched_p1_mark_annotations_false.gt.md | 8 ++++---- .../2408.09869v3_enriched_p1_mark_meta_true.gt.md | 8 ++++---- test/data/doc/barchart.gt.md | 4 ++-- test/data/doc/dummy_doc.yaml.md | 8 ++++---- test/data/doc/group_with_metadata_default.md | 4 ++-- test/data/doc/group_with_metadata_marked.md | 4 ++-- 9 files changed, 35 insertions(+), 27 deletions(-) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index c2d74f31..2946606d 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -46,6 +46,7 @@ FloatingItem, Formatting, FormItem, + GroupItem, InlineGroup, KeyValueItem, ListGroup, @@ -454,17 +455,20 @@ def get_parts( else: my_visited.add(node.self_ref) + meta_part = create_ser_result() + node_is_group = isinstance(node, GroupItem) if ( not params.use_legacy_annotations and node.self_ref not in self.get_excluded_refs(**kwargs) ): - part = self.serialize_meta( + meta_part = self.serialize_meta( item=node, level=lvl, **kwargs, ) - if part.text: - parts.append(part) + if meta_part.text and node_is_group: + # for GroupItems add meta prior to content + parts.append(meta_part) if params.include_non_meta: part = self.serialize( @@ -477,6 +481,10 @@ def get_parts( if part.text: parts.append(part) + if meta_part.text and not node_is_group: + # for DocItems add meta after content + parts.append(meta_part) + return parts @override diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index a8604726..be76da33 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -2,12 +2,12 @@ -In this image, we can see some text and images. - Figure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible. +In this image, we can see some text and images. + licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index fe55c055..3ba308a1 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report -In this image we can see a cartoon image of a duck holding a paper. - +In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,8 +22,6 @@ With Docling , we open-source a very capable and efficient document conversion t torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. | CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | @@ -32,6 +30,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. 
It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index 9a894c88..70092b76 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report -In this image we can see a cartoon image of a duck holding a paper. - +In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,8 +22,6 @@ With Docling , we open-source a very capable and efficient document conversion t torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - summary: Typical Docling setup runtime characterization. type: performance data @@ -35,6 +33,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. 
diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md index 3f8a9266..92ebd61a 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report -[Description] In this image we can see a cartoon image of a duck holding a paper. - +[Description] In this image we can see a cartoon image of a duck holding a paper. + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,8 +22,6 @@ With Docling , we open-source a very capable and efficient document conversion t torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. -[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - summary: Typical Docling setup runtime characterization. type: performance data @@ -35,6 +33,8 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | +[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. 
diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index 0adc5569..323576d8 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -1,5 +1,3 @@ -Bar chart - | Number of impellers | single-frequency | multi-frequency | @@ -10,3 +8,5 @@ Bar chart | 4 | 0.14 | 0.26 | | 5 | 0.16 | 0.25 | | 6 | 0.24 | 0.24 | + +Bar chart diff --git a/test/data/doc/dummy_doc.yaml.md b/test/data/doc/dummy_doc.yaml.md index bd4e6b23..c018c3f4 100644 --- a/test/data/doc/dummy_doc.yaml.md +++ b/test/data/doc/dummy_doc.yaml.md @@ -1,5 +1,9 @@ # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +Figure 1: Four examples of complex page layouts across different document categories + + + ... Bar chart @@ -8,10 +12,6 @@ CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} -Figure 1: Four examples of complex page layouts across different document categories - - - A description annotation for this table. {'foo': 'bar'} diff --git a/test/data/doc/group_with_metadata_default.md b/test/data/doc/group_with_metadata_default.md index 157acfa4..2883e717 100644 --- a/test/data/doc/group_with_metadata_default.md +++ b/test/data/doc/group_with_metadata_default.md @@ -8,10 +8,10 @@ This is some introductory text. This section talks about foo. -This paragraph provides more details about foo. - Regarding foo... +This paragraph provides more details about foo. + Here some foo specifics are listed. 1. lorem diff --git a/test/data/doc/group_with_metadata_marked.md b/test/data/doc/group_with_metadata_marked.md index 5393569e..95dd8fb7 100644 --- a/test/data/doc/group_with_metadata_marked.md +++ b/test/data/doc/group_with_metadata_marked.md @@ -8,10 +8,10 @@ This is some introductory text. [Summary] This section talks about foo. -[Summary] This paragraph provides more details about foo. - Regarding foo... +[Summary] This paragraph provides more details about foo. + [Summary] Here some foo specifics are listed. 1. 
lorem From 5fc98e3362079c9999f66b509a44e67e3f517232 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 21:47:48 +0100 Subject: [PATCH 16/22] restore ser order for all nodeitems Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 14 +++----------- test/data/doc/2408.09869v3_enriched.gt.md | 4 ++-- ...9v3_enriched_p1_include_annotations_false.gt.md | 4 ---- ...9869v3_enriched_p1_mark_annotations_false.gt.md | 6 ++---- .../2408.09869v3_enriched_p1_mark_meta_true.gt.md | 8 ++++---- test/data/doc/barchart.gt.md | 4 ++-- test/data/doc/dummy_doc.yaml.md | 8 ++++---- test/data/doc/group_with_metadata_default.md | 4 ++-- test/data/doc/group_with_metadata_marked.md | 4 ++-- test/test_serialization.py | 9 +++++---- 10 files changed, 26 insertions(+), 39 deletions(-) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 2946606d..c2d74f31 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -46,7 +46,6 @@ FloatingItem, Formatting, FormItem, - GroupItem, InlineGroup, KeyValueItem, ListGroup, @@ -455,20 +454,17 @@ def get_parts( else: my_visited.add(node.self_ref) - meta_part = create_ser_result() - node_is_group = isinstance(node, GroupItem) if ( not params.use_legacy_annotations and node.self_ref not in self.get_excluded_refs(**kwargs) ): - meta_part = self.serialize_meta( + part = self.serialize_meta( item=node, level=lvl, **kwargs, ) - if meta_part.text and node_is_group: - # for GroupItems add meta prior to content - parts.append(meta_part) + if part.text: + parts.append(part) if params.include_non_meta: part = self.serialize( @@ -481,10 +477,6 @@ def get_parts( if part.text: parts.append(part) - if meta_part.text and not node_is_group: - # for DocItems add meta after content - parts.append(meta_part) - return parts @override diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index be76da33..a8604726 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -2,12 +2,12 @@ +In this image, we can see some text and images. + Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. -In this image, we can see some text and images. - licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings. diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md index 3ba308a1..dd345623 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.md @@ -2,8 +2,6 @@ -In this image we can see a cartoon image of a duck holding a paper. 
- Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -30,8 +28,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md index 70092b76..61f88f35 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_false.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report - - In this image we can see a cartoon image of a duck holding a paper. + + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -33,8 +33,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. 
For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md index 92ebd61a..3f8a9266 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -1,9 +1,9 @@ # Docling Technical Report - - [Description] In this image we can see a cartoon image of a duck holding a paper. + + Version 1.0 Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar @@ -22,6 +22,8 @@ With Docling , we open-source a very capable and efficient document conversion t torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. +[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + summary: Typical Docling setup runtime characterization. type: performance data @@ -33,8 +35,6 @@ Table 1: Runtime characteristics of Docling with the standard model pipeline and | Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | -[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} - ## 5 Applications Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. 
diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index 323576d8..0adc5569 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -1,3 +1,5 @@ +Bar chart + | Number of impellers | single-frequency | multi-frequency | @@ -8,5 +10,3 @@ | 4 | 0.14 | 0.26 | | 5 | 0.16 | 0.25 | | 6 | 0.24 | 0.24 | - -Bar chart diff --git a/test/data/doc/dummy_doc.yaml.md b/test/data/doc/dummy_doc.yaml.md index c018c3f4..bd4e6b23 100644 --- a/test/data/doc/dummy_doc.yaml.md +++ b/test/data/doc/dummy_doc.yaml.md @@ -1,9 +1,5 @@ # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -Figure 1: Four examples of complex page layouts across different document categories - - - ... Bar chart @@ -12,6 +8,10 @@ CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} +Figure 1: Four examples of complex page layouts across different document categories + + + A description annotation for this table. {'foo': 'bar'} diff --git a/test/data/doc/group_with_metadata_default.md b/test/data/doc/group_with_metadata_default.md index 2883e717..157acfa4 100644 --- a/test/data/doc/group_with_metadata_default.md +++ b/test/data/doc/group_with_metadata_default.md @@ -8,10 +8,10 @@ This is some introductory text. This section talks about foo. -Regarding foo... - This paragraph provides more details about foo. +Regarding foo... + Here some foo specifics are listed. 1. lorem diff --git a/test/data/doc/group_with_metadata_marked.md b/test/data/doc/group_with_metadata_marked.md index 95dd8fb7..5393569e 100644 --- a/test/data/doc/group_with_metadata_marked.md +++ b/test/data/doc/group_with_metadata_marked.md @@ -8,10 +8,10 @@ This is some introductory text. [Summary] This section talks about foo. -Regarding foo... - [Summary] This paragraph provides more details about foo. +Regarding foo... + [Summary] Here some foo specifics are listed. 1. 
lorem diff --git a/test/test_serialization.py b/test/test_serialization.py index 80b051b3..a8ebcdaa 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -263,7 +263,7 @@ def test_md_list_item_markers(): ) -def test_md_include_annotations_false(): +def test_md_legacy_include_annotations_false(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -271,6 +271,7 @@ def test_md_include_annotations_false(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + use_legacy_annotations=True, include_annotations=False, pages={1, 5}, ), @@ -282,7 +283,7 @@ def test_md_include_annotations_false(): ) -def test_md_mark_annotations_false(): +def test_md_legacy_mark_annotations_false(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -290,6 +291,7 @@ def test_md_mark_annotations_false(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + use_legacy_annotations=True, include_annotations=True, mark_annotations=False, pages={1, 5}, @@ -310,7 +312,6 @@ def test_md_mark_meta_true(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( - include_annotations=True, mark_meta=True, pages={1, 5}, ), @@ -322,7 +323,7 @@ def test_md_mark_meta_true(): ) -def test_md_use_legacy_annotations_true_mark_annotations_true(): +def test_md_legacy_mark_annotations_true(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) From 627ba6147e7b4281239389cfecbbf9c71c02b13d Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 28 Oct 2025 23:30:07 +0100 Subject: [PATCH 17/22] move meta serialization into DocSerializer.serialize() to maintain seamless chunking integration Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 266 +- .../transforms/serializer/markdown.py | 19 + test/data/chunker/0_out_chunks.json | 5479 +++++++++++++---- test/data/chunker/0b_out_chunks.json | 5479 +++++++++++++---- 4 files changed, 8663 insertions(+), 2580 deletions(-) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index c2d74f31..c8288b9d 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -209,15 +209,6 @@ class CommonParams(BaseModel): default=False, description="Use legacy annotation serialization." 
) - # allowed_meta_names: Optional[set[str]] = Field( - # default=None, - # description="Names of meta fields to include; if None, all fields will be included.", - # ) - # blocked_meta_names: set[str] = Field( - # default_factory=set, - # description="Names of meta fields to block; takes precedence over allowed_meta_names.", - # ) - def merge_with_patch(self, patch: dict[str, Any]) -> Self: """Create an instance by merging the provided patch dict on top of self.""" res = self.model_copy(update=patch) @@ -328,103 +319,130 @@ def serialize( ) -> SerializationResult: """Serialize a given node.""" my_visited: set[str] = visited if visited is not None else set() + parts: list[SerializationResult] = [] + delim: str = kwargs.get("delim", "\n") + my_params = self.params.model_copy(update=kwargs) my_kwargs = {**self.params.model_dump(), **kwargs} empty_res = create_ser_result() - if item is None or item == self.doc.body: - if self.doc.body.self_ref not in my_visited: - my_visited.add(self.doc.body.self_ref) - return self._serialize_body(**my_kwargs) - else: - return empty_res - my_visited.add(item.self_ref) + my_item = item or self.doc.body - ######## - # groups - ######## - if isinstance(item, ListGroup): - part = self.list_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, InlineGroup): - part = self.inline_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - visited=my_visited, - **my_kwargs, - ) - ########### - # doc items - ########### - elif isinstance(item, TextItem): - if item.self_ref in self._captions_of_some_item: - # those captions will be handled by the floating item holding them - return empty_res + if my_item == self.doc.body: + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) + + if my_item.self_ref not in my_visited: + my_visited.add(my_item.self_ref) + part = self._serialize_body(**my_kwargs) + if part.text: + parts.append(part) + return create_ser_result( + text=delim.join([p.text for p in parts if p.text]), + span_source=parts, + ) else: - part = ( - self.text_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, + return empty_res + + my_visited.add(my_item.self_ref) + + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) + + if my_params.include_non_meta: + ######## + # groups + ######## + if isinstance(my_item, ListGroup): + part = self.list_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + elif isinstance(my_item, InlineGroup): + part = self.inline_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + visited=my_visited, + **my_kwargs, + ) + ########### + # doc items + ########### + elif isinstance(my_item, TextItem): + if my_item.self_ref in self._captions_of_some_item: + # those captions will be handled by the floating item holding them + return empty_res + else: + part = ( + self.text_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + 
is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + if my_item.self_ref not in self.get_excluded_refs(**kwargs) + else empty_res ) - if item.self_ref not in self.get_excluded_refs(**kwargs) - else empty_res + elif isinstance(my_item, TableItem): + part = self.table_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, ) - elif isinstance(item, TableItem): - part = self.table_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, PictureItem): - part = self.picture_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, KeyValueItem): - part = self.key_value_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, FormItem): - part = self.form_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, _PageBreakNode): - part = _PageBreakSerResult( - text=self._create_page_break(node=item), - node=item, - ) - else: - part = self.fallback_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - return part + elif isinstance(my_item, PictureItem): + part = self.picture_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, + ) + elif isinstance(my_item, KeyValueItem): + part = self.key_value_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + **my_kwargs, + ) + elif isinstance(my_item, FormItem): + part = self.form_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + **my_kwargs, + ) + elif isinstance(my_item, _PageBreakNode): + part = _PageBreakSerResult( + text=self._create_page_break(node=my_item), + node=my_item, + ) + else: + part = self.fallback_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, + ) + parts.append(part) + + return create_ser_result( + text=delim.join([p.text for p in parts if p.text]), span_source=parts + ) # making some assumptions about the kwargs it can pass @override @@ -454,28 +472,15 @@ def get_parts( else: my_visited.add(node.self_ref) - if ( - not params.use_legacy_annotations - and node.self_ref not in self.get_excluded_refs(**kwargs) - ): - part = self.serialize_meta( - item=node, - level=lvl, - **kwargs, - ) - if part.text: - parts.append(part) - - if params.include_non_meta: - part = self.serialize( - item=node, - list_level=list_level, - is_inline_scope=is_inline_scope, - visited=my_visited, - **kwargs, - ) - if part.text: - parts.append(part) + part = self.serialize( + item=node, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=my_visited, + **(dict(level=lvl) | kwargs), + ) + if part.text: + parts.append(part) return parts @@ -578,20 +583,21 @@ def serialize_meta( ) -> SerializationResult: """Serialize the item's meta.""" if self.meta_serializer: - return self.meta_serializer.serialize( - item=item, - doc=self.doc, - **kwargs, - ) + if item.self_ref not in self.get_excluded_refs(**kwargs): + return self.meta_serializer.serialize( + item=item, + doc=self.doc, + **kwargs, + ) + else: + return create_ser_result( + text="", span_source=item if isinstance(item, DocItem) else [] + ) else: _logger.warning("No meta serializer found.") return 
create_ser_result( text="", span_source=item if isinstance(item, DocItem) else [] ) - # return create_ser_result( - # text=item.meta.model_dump_json() if item.meta else "", - # span_source=item, - # ) # TODO deprecate @override diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 45e7b718..5b9f3e5e 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -805,3 +805,22 @@ def serialize_doc( def requires_page_break(self) -> bool: """Whether to add page breaks.""" return self.params.page_break_placeholder is not None + + @override + def serialize( + self, + *, + item: Optional[NodeItem] = None, + list_level: int = 0, + is_inline_scope: bool = False, + visited: Optional[set[str]] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serialize a given node.""" + return super().serialize( + item=item, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=visited, + **(dict(delim="\n\n") | kwargs), + ) diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index a32d9912..f0eefe5a 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -1,5 +1,45 @@ { "root": [ + { + "text": "In this image we can see a cartoon image of a duck holding a paper.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 261.966552734375, + "t": 715.8966522216797, + "r": 348.65899658203125, + "b": 627.1333770751953, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, { "text": "Version 1.0", "meta": { @@ -813,11 +853,86 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.", + "text": "In this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 110.07231140136719, + "t": 719.2913360595703, + "r": 500.7577209472656, + "b": 581.2926177978516, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + }, { "self_ref": "#/texts/31", "parent": { @@ -3146,1210 +3261,3212 @@ } }, { - "text": "AGL Energy Limited ABN 74 1", + "text": "In this image there is a table with some text on it.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ { - "self_ref": "#/texts/393", + "self_ref": "#/pictures/2", "parent": { "$ref": "#/body" }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "children": [ { - "page_no": 7, - "bbox": { - "l": 226.786, - "t": 560.516, - "r": 233.176, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "5 061 375", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/394", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/129" + }, { - "page_no": 7, - "bbox": { - "l": 233.40500000000003, - "t": 560.516, - "r": 235.66499999999996, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 9 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 1: Four examples of complex page layouts across different document categories", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/503", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/130" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 499.2799999999999, - "r": 312.251, - "b": 490.75200000000007, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 84 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine 
Learning", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/505", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/131" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 474.62299999999993, - "r": 312.021, - "b": 465.961, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 90 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/507", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/132" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 458.719, - "r": 312.156, - "b": 436.156, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 374 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "1 INTRODUCTION", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/508", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/133" + }, { - "page_no": 7, - "bbox": { - "l": 329.602, - "t": 428.537, - "r": 373.375, - "b": 423.963, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 14 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/509", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/134" + }, { - "page_no": 7, - "bbox": { - "l": 108.0, - "t": 419.051, - "r": 527.591, - "b": 377.77099999999996, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1026 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/511", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/135" + }, { - "page_no": 8, - "bbox": { - "l": 122.99899999999998, - "t": 563.105, - "r": 338.603, - "b": 558.655, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 130 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/512", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/136" + }, { - "page_no": 8, - "bbox": { - "l": 122.87200000000001, - "t": 552.103, - "r": 226.37599999999998, - "b": 509.485, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 489 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, human = 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, MRCNN R50 R101 = 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, FRCNN R101 = 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4. 
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, YOLO v5x6 = 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/1", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ + "$ref": "#/texts/137" + }, { - "page_no": 8, - "bbox": { - "l": 125.8864517211914, - "t": 505.50439453125, - "r": 223.0050506591797, - "b": 437.8017272949219, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/513", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/138" + }, { - "page_no": 8, - "bbox": { - "l": 122.884, - "t": 431.161, - "r": 226.336, - "b": 341.5470000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1252 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/515", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/139" + }, { - "page_no": 8, - "bbox": { - "l": 122.86499999999998, - "t": 327.581, - "r": 226.282, - "b": 284.81, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 584 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Third, achienec", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/516", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/140" + }, { - "page_no": 8, - "bbox": { - "l": 436.0, - "t": 447.0, - "r": 509.66666666666663, - "b": 418.66666666666663, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 15 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "chalenongayouls ground-vuth dawa such WC", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/518", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/141" + }, { - "page_no": 8, - "bbox": { - "l": 366.0, - "t": 386.0, - "r": 529.3333333333334, - "b": 375.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 40 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. 
The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/519", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/142" + }, { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 469.97300000000007, - "r": 339.288, - "b": 441.408, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 322 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/520", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/143" + }, { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 425.568, - "r": 338.603, - "b": 415.587, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 102 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/521", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/144" + }, { - "page_no": 8, - "bbox": { - "l": 235.776, - "t": 416.19999999999993, - "r": 338.703, - "b": 382.7970000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 397 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. 
It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/523", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/145" + }, { - "page_no": 8, - "bbox": { - "l": 235.823, - "t": 370.85, - "r": 338.7, - "b": 285.921, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1146 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "coioct dcochon modols", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/524", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/146" + }, { - "page_no": 8, - "bbox": { - "l": 456.6666666666667, - "t": 344.0, - "r": 485.33333333333337, - "b": 341.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 21 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "mak enbrel", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/526", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/147" + }, { - "page_no": 8, - "bbox": { - "l": 470.6666666666667, - "t": 308.6666666666667, - "r": 524.0, - "b": 285.3333333333333, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 10 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/527", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/148" + }, { - "page_no": 8, - "bbox": { - "l": 108.0, - "t": 266.424, - "r": 504.00300000000004, - "b": 225.14499999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 393 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/529", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/149" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 598.985, - "r": 186.95, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 48 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/530", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/150" + }, { - "page_no": 9, - "bbox": { - "l": 190.471, - "t": 598.985, - "r": 346.254, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 81 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/531", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/151" + }, { - "page_no": 9, - "bbox": { - "l": 88.525, - "t": 586.821, - "r": 346.401, - "b": 580.676, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 123 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/532", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/152" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 575.628, - "r": 301.135, - "b": 569.484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 99 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/533", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/153" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 581.225, - "r": 346.254, - "b": 575.08, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 124 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Caption, Count.Count = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. Footnote, Count.Count = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. 
Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 82-97. Formula, Count.Count = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. List-item, Count.Count = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 93-95. Page-footer, Count.Count = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 96-98. Page-header, Count.Count = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 81-86. Picture, Count.Count = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 59-76. Section-header, Count.Count = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. 
Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 78-86. Table, Count.Count = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 70-85. Text, Count.Count = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 87-95. Title, Count.Count = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 24-56. Total, Count.Count = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 68-85", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/3", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ + "$ref": "#/texts/154" + }, { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "include publication repositories such as arXiv", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/695", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/155" + }, { - "page_no": 9, - "bbox": { - "l": 223.57, - "t": 471.407, - "r": 306.847, - "b": 465.079, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 46 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { + "$ref": "#/texts/156" + }, + { + "$ref": "#/texts/157" + }, + { + "$ref": "#/texts/158" + }, + { + "$ref": "#/texts/159" + }, + { + "$ref": "#/texts/160" + }, + { + "$ref": "#/texts/161" + }, + { + "$ref": "#/texts/162" + }, + { + "$ref": "#/texts/163" + }, + { + "$ref": "#/texts/164" + }, + { + "$ref": "#/texts/165" + }, + { + "$ref": "#/texts/166" + }, + { + "$ref": "#/texts/167" + }, + { + "$ref": "#/texts/168" + }, + { + "$ref": "#/texts/169" + }, + { + "$ref": "#/texts/170" + }, + { + "$ref": "#/texts/171" + }, + { + "$ref": "#/texts/172" + }, + { + "$ref": "#/texts/173" + }, + { + "$ref": "#/texts/174" + }, + { + "$ref": "#/texts/175" + }, + { + "$ref": "#/texts/176" + }, + { + "$ref": "#/texts/177" + }, + { + "$ref": "#/texts/178" + }, + { + "$ref": "#/texts/179" + }, + { + "$ref": "#/texts/180" + }, + { + "$ref": "#/texts/181" + }, + { + "$ref": "#/texts/182" + }, + { + "$ref": "#/texts/183" + }, + { + "$ref": "#/texts/184" + }, + { + "$ref": "#/texts/185" + }, + { + "$ref": "#/texts/186" + }, + { + "$ref": "#/texts/187" + }, + { + "$ref": "#/texts/188" + }, + { + "$ref": "#/texts/189" + }, + { + "$ref": "#/texts/190" + }, + { + "$ref": "#/texts/191" + }, + { + "$ref": "#/texts/192" + }, + { + "$ref": "#/texts/193" + }, + { + "$ref": "#/texts/194" + }, + { + "$ref": "#/texts/195" + }, + { + "$ref": "#/texts/196" + }, + { + "$ref": "#/texts/197" + }, + { + "$ref": "#/texts/198" + }, + { + "$ref": "#/texts/199" + }, + { + "$ref": "#/texts/200" + }, + { + "$ref": "#/texts/201" + }, + { + "$ref": "#/texts/202" + }, + { + "$ref": "#/texts/203" + }, + { + "$ref": "#/texts/204" + }, + { + "$ref": "#/texts/205" + }, + { + "$ref": "#/texts/206" + }, + { + "$ref": "#/texts/207" + }, + { + "$ref": "#/texts/208" + }, + { + "$ref": "#/texts/209" + }, + { + "$ref": "#/texts/210" + }, + { + "$ref": "#/texts/211" + }, + { + "$ref": "#/texts/212" + }, + { + "$ref": "#/texts/213" + }, + { + "$ref": "#/texts/214" + }, + { + "$ref": "#/texts/215" + }, + { + "$ref": "#/texts/216" + }, + { + 
"$ref": "#/texts/217" + }, + { + "$ref": "#/texts/218" + }, + { + "$ref": "#/texts/219" + }, + { + "$ref": "#/texts/220" + }, + { + "$ref": "#/texts/221" + }, + { + "$ref": "#/texts/222" + }, + { + "$ref": "#/texts/223" + }, + { + "$ref": "#/texts/224" + }, + { + "$ref": "#/texts/225" + }, + { + "$ref": "#/texts/226" + }, + { + "$ref": "#/texts/227" + }, + { + "$ref": "#/texts/228" + }, + { + "$ref": "#/texts/229" + }, + { + "$ref": "#/texts/230" + }, + { + "$ref": "#/texts/231" + }, + { + "$ref": "#/texts/232" + }, + { + "$ref": "#/texts/233" + }, + { + "$ref": "#/texts/234" + }, + { + "$ref": "#/texts/235" + }, + { + "$ref": "#/texts/236" + }, + { + "$ref": "#/texts/237" + }, + { + "$ref": "#/texts/238" + }, + { + "$ref": "#/texts/239" + }, + { + "$ref": "#/texts/240" + }, + { + "$ref": "#/texts/241" + }, + { + "$ref": "#/texts/242" + }, + { + "$ref": "#/texts/243" + }, + { + "$ref": "#/texts/244" + }, + { + "$ref": "#/texts/245" + }, + { + "$ref": "#/texts/246" + }, + { + "$ref": "#/texts/247" + }, + { + "$ref": "#/texts/248" + }, + { + "$ref": "#/texts/249" + }, + { + "$ref": "#/texts/250" + }, + { + "$ref": "#/texts/251" + }, + { + "$ref": "#/texts/252" + }, + { + "$ref": "#/texts/253" + }, + { + "$ref": "#/texts/254" + }, + { + "$ref": "#/texts/255" + }, + { + "$ref": "#/texts/256" + }, + { + "$ref": "#/texts/257" + }, + { + "$ref": "#/texts/258" + }, + { + "$ref": "#/texts/259" + }, + { + "$ref": "#/texts/260" + }, + { + "$ref": "#/texts/261" + }, + { + "$ref": "#/texts/262" + }, + { + "$ref": "#/texts/263" + }, + { + "$ref": "#/texts/264" + }, + { + "$ref": "#/texts/265" + }, + { + "$ref": "#/texts/266" + }, + { + "$ref": "#/texts/267" + }, + { + "$ref": "#/texts/268" + }, + { + "$ref": "#/texts/269" + }, + { + "$ref": "#/texts/270" + }, + { + "$ref": "#/texts/271" + }, + { + "$ref": "#/texts/272" + }, + { + "$ref": "#/texts/273" + }, + { + "$ref": "#/texts/274" + }, + { + "$ref": "#/texts/275" + }, + { + "$ref": "#/texts/276" + }, + { + "$ref": "#/texts/277" + }, + { + "$ref": "#/texts/278" + }, + { + "$ref": "#/texts/279" + }, + { + "$ref": "#/texts/280" + }, + { + "$ref": "#/texts/281" + }, + { + "$ref": "#/texts/282" + }, + { + "$ref": "#/texts/283" + }, + { + "$ref": "#/texts/284" + }, + { + "$ref": "#/texts/285" + }, + { + "$ref": "#/texts/286" + }, + { + "$ref": "#/texts/287" + }, + { + "$ref": "#/texts/288" + }, + { + "$ref": "#/texts/289" + }, + { + "$ref": "#/texts/290" + }, + { + "$ref": "#/texts/291" + }, + { + "$ref": "#/texts/292" + }, + { + "$ref": "#/texts/293" + }, + { + "$ref": "#/texts/294" + }, + { + "$ref": "#/texts/295" + }, + { + "$ref": "#/texts/296" + }, + { + "$ref": "#/texts/297" + }, + { + "$ref": "#/texts/298" + }, + { + "$ref": "#/texts/299" + }, + { + "$ref": "#/texts/300" + }, + { + "$ref": "#/texts/301" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 223.45245361328125, + "t": 606.3411560058594, + "r": 277.1462707519531, + "b": 563.2440032958984, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image we can see a text.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/3", + "parent": { + "$ref": "#/body" + }, + "children": [ 
+ { + "$ref": "#/texts/302" + }, + { + "$ref": "#/texts/303" + }, + { + "$ref": "#/texts/304" + }, + { + "$ref": "#/texts/305" + }, + { + "$ref": "#/texts/306" + }, + { + "$ref": "#/texts/307" + }, + { + "$ref": "#/texts/308" + }, + { + "$ref": "#/texts/309" + }, + { + "$ref": "#/texts/310" + }, + { + "$ref": "#/texts/311" + }, + { + "$ref": "#/texts/312" + }, + { + "$ref": "#/texts/313" + }, + { + "$ref": "#/texts/314" + }, + { + "$ref": "#/texts/315" + }, + { + "$ref": "#/texts/316" + }, + { + "$ref": "#/texts/317" + }, + { + "$ref": "#/texts/318" + }, + { + "$ref": "#/texts/319" + }, + { + "$ref": "#/texts/320" + }, + { + "$ref": "#/texts/321" + }, + { + "$ref": "#/texts/322" + }, + { + "$ref": "#/texts/323" + }, + { + "$ref": "#/texts/324" + }, + { + "$ref": "#/texts/325" + }, + { + "$ref": "#/texts/326" + }, + { + "$ref": "#/texts/327" + }, + { + "$ref": "#/texts/328" + }, + { + "$ref": "#/texts/329" + }, + { + "$ref": "#/texts/330" + }, + { + "$ref": "#/texts/331" + }, + { + "$ref": "#/texts/332" + }, + { + "$ref": "#/texts/333" + }, + { + "$ref": "#/texts/334" + }, + { + "$ref": "#/texts/335" + }, + { + "$ref": "#/texts/336" + }, + { + "$ref": "#/texts/337" + }, + { + "$ref": "#/texts/338" + }, + { + "$ref": "#/texts/339" + }, + { + "$ref": "#/texts/340" + }, + { + "$ref": "#/texts/341" + }, + { + "$ref": "#/texts/342" + }, + { + "$ref": "#/texts/343" + }, + { + "$ref": "#/texts/344" + }, + { + "$ref": "#/texts/345" + }, + { + "$ref": "#/texts/346" + }, + { + "$ref": "#/texts/347" + }, + { + "$ref": "#/texts/348" + }, + { + "$ref": "#/texts/349" + }, + { + "$ref": "#/texts/350" + }, + { + "$ref": "#/texts/351" + }, + { + "$ref": "#/texts/352" + }, + { + "$ref": "#/texts/353" + }, + { + "$ref": "#/texts/354" + }, + { + "$ref": "#/texts/355" + }, + { + "$ref": "#/texts/356" + }, + { + "$ref": "#/texts/357" + }, + { + "$ref": "#/texts/358" + }, + { + "$ref": "#/texts/359" + }, + { + "$ref": "#/texts/360" + }, + { + "$ref": "#/texts/361" + }, + { + "$ref": "#/texts/362" + }, + { + "$ref": "#/texts/363" + }, + { + "$ref": "#/texts/364" + }, + { + "$ref": "#/texts/365" + }, + { + "$ref": "#/texts/366" + }, + { + "$ref": "#/texts/367" + }, + { + "$ref": "#/texts/368" + }, + { + "$ref": "#/texts/369" + }, + { + "$ref": "#/texts/370" + }, + { + "$ref": "#/texts/371" + }, + { + "$ref": "#/texts/372" + }, + { + "$ref": "#/texts/373" + }, + { + "$ref": "#/texts/374" + }, + { + "$ref": "#/texts/375" + }, + { + "$ref": "#/texts/376" + }, + { + "$ref": "#/texts/377" + }, + { + "$ref": "#/texts/378" + }, + { + "$ref": "#/texts/379" + }, + { + "$ref": "#/texts/380" + }, + { + "$ref": "#/texts/381" + }, + { + "$ref": "#/texts/382" + }, + { + "$ref": "#/texts/383" + }, + { + "$ref": "#/texts/384" + }, + { + "$ref": "#/texts/385" + }, + { + "$ref": "#/texts/386" + }, + { + "$ref": "#/texts/387" + }, + { + "$ref": "#/texts/388" + }, + { + "$ref": "#/texts/389" + }, + { + "$ref": "#/texts/390" + }, + { + "$ref": "#/texts/391" + }, + { + "$ref": "#/texts/392" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 279.03204345703125, + "t": 607.0251770019531, + "r": 312.2338562011719, + "b": 562.7499389648438, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "AGL Energy Limited ABN 74 1", 
+ "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/393", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 226.786, + "t": 560.516, + "r": 233.176, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "5 061 375", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/394", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 233.40500000000003, + "t": 560.516, + "r": 235.66499999999996, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image I can see the text on the image.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/395" + }, + { + "$ref": "#/texts/396" + }, + { + "$ref": "#/texts/397" + }, + { + "$ref": "#/texts/398" + }, + { + "$ref": "#/texts/399" + }, + { + "$ref": "#/texts/400" + }, + { + "$ref": "#/texts/401" + }, + { + "$ref": "#/texts/402" + }, + { + "$ref": "#/texts/403" + }, + { + "$ref": "#/texts/404" + }, + { + "$ref": "#/texts/405" + }, + { + "$ref": "#/texts/406" + }, + { + "$ref": "#/texts/407" + }, + { + "$ref": "#/texts/408" + }, + { + "$ref": "#/texts/409" + }, + { + "$ref": "#/texts/410" + }, + { + "$ref": "#/texts/411" + }, + { + "$ref": "#/texts/412" + }, + { + "$ref": "#/texts/413" + }, + { + "$ref": "#/texts/414" + }, + { + "$ref": "#/texts/415" + }, + { + "$ref": "#/texts/416" + }, + { + "$ref": "#/texts/417" + }, + { + "$ref": "#/texts/418" + }, + { + "$ref": "#/texts/419" + }, + { + "$ref": "#/texts/420" + }, + { + "$ref": "#/texts/421" + }, + { + "$ref": "#/texts/422" + }, + { + "$ref": "#/texts/423" + }, + { + "$ref": "#/texts/424" + }, + { + "$ref": "#/texts/425" + }, + { + "$ref": "#/texts/426" + }, + { + "$ref": "#/texts/427" + }, + { + "$ref": "#/texts/428" + }, + { + "$ref": "#/texts/429" + }, + { + "$ref": "#/texts/430" + }, + { + "$ref": "#/texts/431" + }, + { + "$ref": "#/texts/432" + }, + { + "$ref": "#/texts/433" + }, + { + "$ref": "#/texts/434" + }, + { + "$ref": "#/texts/435" + }, + { + "$ref": "#/texts/436" + }, + { + "$ref": "#/texts/437" + }, + { + "$ref": "#/texts/438" + }, + { + "$ref": "#/texts/439" + }, + { + "$ref": "#/texts/440" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 224.6795196533203, + "t": 559.731201171875, + "r": 268.13018798828125, + "b": 503.4937438964844, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + 
"binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image there is a paper with some text on it.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/441" + }, + { + "$ref": "#/texts/442" + }, + { + "$ref": "#/texts/443" + }, + { + "$ref": "#/texts/444" + }, + { + "$ref": "#/texts/445" + }, + { + "$ref": "#/texts/446" + }, + { + "$ref": "#/texts/447" + }, + { + "$ref": "#/texts/448" + }, + { + "$ref": "#/texts/449" + }, + { + "$ref": "#/texts/450" + }, + { + "$ref": "#/texts/451" + }, + { + "$ref": "#/texts/452" + }, + { + "$ref": "#/texts/453" + }, + { + "$ref": "#/texts/454" + }, + { + "$ref": "#/texts/455" + }, + { + "$ref": "#/texts/456" + }, + { + "$ref": "#/texts/457" + }, + { + "$ref": "#/texts/458" + }, + { + "$ref": "#/texts/459" + }, + { + "$ref": "#/texts/460" + }, + { + "$ref": "#/texts/461" + }, + { + "$ref": "#/texts/462" + }, + { + "$ref": "#/texts/463" + }, + { + "$ref": "#/texts/464" + }, + { + "$ref": "#/texts/465" + }, + { + "$ref": "#/texts/466" + }, + { + "$ref": "#/texts/467" + }, + { + "$ref": "#/texts/468" + }, + { + "$ref": "#/texts/469" + }, + { + "$ref": "#/texts/470" + }, + { + "$ref": "#/texts/471" + }, + { + "$ref": "#/texts/472" + }, + { + "$ref": "#/texts/473" + }, + { + "$ref": "#/texts/474" + }, + { + "$ref": "#/texts/475" + }, + { + "$ref": "#/texts/476" + }, + { + "$ref": "#/texts/477" + }, + { + "$ref": "#/texts/478" + }, + { + "$ref": "#/texts/479" + }, + { + "$ref": "#/texts/480" + }, + { + "$ref": "#/texts/481" + }, + { + "$ref": "#/texts/482" + }, + { + "$ref": "#/texts/483" + }, + { + "$ref": "#/texts/484" + }, + { + "$ref": "#/texts/485" + }, + { + "$ref": "#/texts/486" + }, + { + "$ref": "#/texts/487" + }, + { + "$ref": "#/texts/488" + }, + { + "$ref": "#/texts/489" + }, + { + "$ref": "#/texts/490" + }, + { + "$ref": "#/texts/491" + }, + { + "$ref": "#/texts/492" + }, + { + "$ref": "#/texts/493" + }, + { + "$ref": "#/texts/494" + }, + { + "$ref": "#/texts/495" + }, + { + "$ref": "#/texts/496" + }, + { + "$ref": "#/texts/497" + }, + { + "$ref": "#/texts/498" + }, + { + "$ref": "#/texts/499" + }, + { + "$ref": "#/texts/500" + }, + { + "$ref": "#/texts/501" + }, + { + "$ref": "#/texts/502" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 269.2328186035156, + "t": 558.8644409179688, + "r": 311.74884033203125, + "b": 502.994873046875, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 1: Four examples of complex page layouts across different document categories", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/503", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 499.2799999999999, + "r": 312.251, + "b": 490.75200000000007, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 84 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { 
+ "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/505", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 474.62299999999993, + "r": 312.021, + "b": 465.961, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 90 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/507", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 458.719, + "r": 312.156, + "b": 436.156, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 374 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "1 INTRODUCTION", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/508", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 329.602, + "t": 428.537, + "r": 373.375, + "b": 423.963, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 14 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/509", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 108.0, + "t": 419.051, + "r": 527.591, + "b": 377.77099999999996, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1026 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/511", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.99899999999998, + "t": 563.105, + "r": 338.603, + "b": 558.655, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 130 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/512", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.87200000000001, + "t": 552.103, + "r": 226.37599999999998, + "b": 509.485, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 489 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, human = 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, MRCNN R50 R101 = 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5. Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, FRCNN R101 = 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4. 
Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All, YOLO v5x6 = 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 125.8864517211914, + "t": 505.50439453125, + "r": 223.0050506591797, + "b": 437.8017272949219, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/513", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.884, + "t": 431.161, + "r": 226.336, + "b": 341.5470000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1252 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/515", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.86499999999998, + "t": 327.581, + "r": 226.282, + "b": 284.81, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 584 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image, we can see a table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 366.8663635253906, + "t": 542.9663391113281, + "r": 460.8086242675781, + "b": 450.9350280761719, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Third, achienec", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/516", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 436.0, + "t": 447.0, + "r": 509.66666666666663, + "b": 418.66666666666663, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "chalenongayouls ground-vuth dawa such WC", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/518", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 366.0, + "t": 386.0, + "r": 529.3333333333334, + "b": 375.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 40 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 237.6404266357422, + "t": 550.1458740234375, + "r": 337.0112609863281, + "b": 477.0093078613281, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/519", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 469.97300000000007, + "r": 339.288, + "b": 441.408, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 322 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/520", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 425.568, + "r": 338.603, + "b": 415.587, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 102 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). 
These scores are computed by leveraging the evaluation code provided by the COCO API [16].", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/521", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.776, + "t": 416.19999999999993, + "r": 338.703, + "b": 382.7970000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 397 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . 
This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/523", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.823, + "t": 370.85, + "r": 338.7, + "b": 285.921, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1146 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "coioct dcochon modols", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/524", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 456.6666666666667, + "t": 344.0, + "r": 485.33333333333337, + "b": 341.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "mak enbrel", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/526", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 470.6666666666667, + "t": 308.6666666666667, + "r": 524.0, + "b": 285.3333333333333, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 10 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/527", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 108.0, + "t": 266.424, + "r": 504.00300000000004, + "b": 225.14499999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 393 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/529", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 598.985, + "r": 186.95, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 48 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/530", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 190.471, + "t": 598.985, + "r": 346.254, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 81 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/531", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.525, + "t": 586.821, + "r": 346.401, + "b": 580.676, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 123 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/532", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 575.628, + "r": 301.135, + "b": 569.484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 99 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/533", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 581.225, + "r": 346.254, + "b": 575.08, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 124 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. 
**Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/534" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.43017578125, + "t": 573.9806060791016, + "r": 124.71578216552734, + "b": 559.4710540771484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/535" + }, + { + "$ref": "#/texts/536" + }, + { + "$ref": "#/texts/537" + }, + { + "$ref": "#/texts/538" + }, + { + "$ref": "#/texts/539" + }, + { + "$ref": "#/texts/540" + }, + { + "$ref": "#/texts/541" + }, + { + "$ref": "#/texts/542" + }, + { + "$ref": "#/texts/543" + }, + { + "$ref": "#/texts/544" + }, + { + "$ref": "#/texts/545" + }, + { + "$ref": "#/texts/546" + }, + { + "$ref": "#/texts/547" + }, + { + "$ref": "#/texts/548" + }, + { + "$ref": "#/texts/549" + }, + { + "$ref": "#/texts/550" + }, + { + "$ref": "#/texts/551" + }, + { + "$ref": "#/texts/552" + }, + { + "$ref": "#/texts/553" + }, + { + "$ref": "#/texts/554" + }, + { + "$ref": "#/texts/555" + }, + { + "$ref": "#/texts/556" + }, + { + "$ref": "#/texts/557" + }, + { + "$ref": "#/texts/558" + }, + { + "$ref": "#/texts/559" + }, + { + "$ref": "#/texts/560" + }, + { + "$ref": "#/texts/561" + }, + { + "$ref": "#/texts/562" + }, + { + "$ref": "#/texts/563" + }, + { + "$ref": "#/texts/564" + }, + { + "$ref": "#/texts/565" + }, + { + "$ref": "#/texts/566" + }, + { + "$ref": "#/texts/567" + }, + { + "$ref": "#/texts/568" + }, + { + "$ref": "#/texts/569" + }, + { + "$ref": "#/texts/570" + }, + { + "$ref": "#/texts/571" + }, + { + "$ref": "#/texts/572" + }, + { + "$ref": "#/texts/573" + }, + { + "$ref": "#/texts/574" + }, + { + "$ref": "#/texts/575" + }, + { + "$ref": "#/texts/576" + }, + { + "$ref": "#/texts/577" + }, + { + "$ref": "#/texts/578" + }, + { + "$ref": "#/texts/579" + }, + { + "$ref": "#/texts/580" + }, + { + "$ref": "#/texts/581" + }, + { + "$ref": "#/texts/582" + }, + { + "$ref": "#/texts/583" + }, + { + "$ref": "#/texts/584" + }, + { + "$ref": "#/texts/585" + }, + { + "$ref": "#/texts/586" + }, + { + "$ref": "#/texts/587" + }, + { + "$ref": "#/texts/588" + }, + { + "$ref": "#/texts/589" + }, + { + "$ref": "#/texts/590" + }, + { + "$ref": "#/texts/591" + }, + { + "$ref": "#/texts/592" + }, + { + "$ref": "#/texts/593" + }, + { + "$ref": "#/texts/594" + }, + { + "$ref": "#/texts/595" + }, + { + "$ref": "#/texts/596" + }, + { + "$ref": "#/texts/597" + }, + { + "$ref": "#/texts/598" + }, + { + "$ref": "#/texts/599" + }, + { 
+ "$ref": "#/texts/600" + }, + { + "$ref": "#/texts/601" + }, + { + "$ref": "#/texts/602" + }, + { + "$ref": "#/texts/603" + }, + { + "$ref": "#/texts/604" + }, + { + "$ref": "#/texts/605" + }, + { + "$ref": "#/texts/606" + }, + { + "$ref": "#/texts/607" + }, + { + "$ref": "#/texts/608" + }, + { + "$ref": "#/texts/609" + }, + { + "$ref": "#/texts/610" + }, + { + "$ref": "#/texts/611" + }, + { + "$ref": "#/texts/612" + }, + { + "$ref": "#/texts/613" + }, + { + "$ref": "#/texts/614" + }, + { + "$ref": "#/texts/615" + }, + { + "$ref": "#/texts/616" + }, + { + "$ref": "#/texts/617" + }, + { + "$ref": "#/texts/618" + }, + { + "$ref": "#/texts/619" + }, + { + "$ref": "#/texts/620" + }, + { + "$ref": "#/texts/621" + }, + { + "$ref": "#/texts/622" + }, + { + "$ref": "#/texts/623" + }, + { + "$ref": "#/texts/624" + }, + { + "$ref": "#/texts/625" + }, + { + "$ref": "#/texts/626" + }, + { + "$ref": "#/texts/627" + }, + { + "$ref": "#/texts/628" + }, + { + "$ref": "#/texts/629" + }, + { + "$ref": "#/texts/630" + }, + { + "$ref": "#/texts/631" + }, + { + "$ref": "#/texts/632" + }, + { + "$ref": "#/texts/633" + }, + { + "$ref": "#/texts/634" + }, + { + "$ref": "#/texts/635" + }, + { + "$ref": "#/texts/636" + }, + { + "$ref": "#/texts/637" + }, + { + "$ref": "#/texts/638" + }, + { + "$ref": "#/texts/639" + }, + { + "$ref": "#/texts/640" + }, + { + "$ref": "#/texts/641" + }, + { + "$ref": "#/texts/642" + }, + { + "$ref": "#/texts/643" + }, + { + "$ref": "#/texts/644" + }, + { + "$ref": "#/texts/645" + }, + { + "$ref": "#/texts/646" + }, + { + "$ref": "#/texts/647" + }, + { + "$ref": "#/texts/648" + }, + { + "$ref": "#/texts/649" + }, + { + "$ref": "#/texts/650" + }, + { + "$ref": "#/texts/651" + }, + { + "$ref": "#/texts/652" + }, + { + "$ref": "#/texts/653" + }, + { + "$ref": "#/texts/654" + }, + { + "$ref": "#/texts/655" + }, + { + "$ref": "#/texts/656" + }, + { + "$ref": "#/texts/657" + }, + { + "$ref": "#/texts/658" + }, + { + "$ref": "#/texts/659" + }, + { + "$ref": "#/texts/660" + }, + { + "$ref": "#/texts/661" + }, + { + "$ref": "#/texts/662" + }, + { + "$ref": "#/texts/663" + }, + { + "$ref": "#/texts/664" + }, + { + "$ref": "#/texts/665" + }, + { + "$ref": "#/texts/666" + }, + { + "$ref": "#/texts/667" + }, + { + "$ref": "#/texts/668" + }, + { + "$ref": "#/texts/669" + }, + { + "$ref": "#/texts/670" + }, + { + "$ref": "#/texts/671" + }, + { + "$ref": "#/texts/672" + }, + { + "$ref": "#/texts/673" + }, + { + "$ref": "#/texts/674" + }, + { + "$ref": "#/texts/675" + }, + { + "$ref": "#/texts/676" + }, + { + "$ref": "#/texts/677" + }, + { + "$ref": "#/texts/678" + }, + { + "$ref": "#/texts/679" + }, + { + "$ref": "#/texts/680" + }, + { + "$ref": "#/texts/681" + }, + { + "$ref": "#/texts/682" + }, + { + "$ref": "#/texts/683" + }, + { + "$ref": "#/texts/684" + }, + { + "$ref": "#/texts/685" + }, + { + "$ref": "#/texts/686" + }, + { + "$ref": "#/texts/687" + }, + { + "$ref": "#/texts/688" + }, + { + "$ref": "#/texts/689" + }, + { + "$ref": "#/texts/690" + }, + { + "$ref": "#/texts/691" + }, + { + "$ref": "#/texts/692" + }, + { + "$ref": "#/texts/693" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 
14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Caption, Count.Count = 22524. Caption, % of Total.Train = 2.04. Caption, % of Total.Test = 1.77. Caption, % of Total.Val = 2.32. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-89. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 40-61. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 86-92. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 95-99. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-78. Caption, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. Footnote, Count.Count = 6318. Footnote, % of Total.Train = 0.60. Footnote, % of Total.Test = 0.31. Footnote, % of Total.Val = 0.58. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-91. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 100. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 62-88. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 85-94. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Footnote, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 82-97. Formula, Count.Count = 25027. Formula, % of Total.Train = 2.25. Formula, % of Total.Test = 1.90. Formula, % of Total.Val = 2.96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-85. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Man = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 84-87. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-96. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = n/a. Formula, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = n/a. List-item, Count.Count = 185660. List-item, % of Total.Train = 17.19. List-item, % of Total.Test = 13.34. List-item, % of Total.Val = 15.82. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).All = 87-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 74-83. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 97-97. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 81-85. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 75-88. List-item, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 93-95. Page-footer, Count.Count = 70878. Page-footer, % of Total.Train = 6.51. Page-footer, % of Total.Test = 5.58. Page-footer, % of Total.Val = 6.00. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).All = 93-94. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 88-90. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 95-96. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 92-97. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 100. Page-footer, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 96-98. Page-header, Count.Count = 58022. Page-header, % of Total.Train = 5.10. Page-header, % of Total.Test = 6.70. Page-header, % of Total.Val = 5.06. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 85-89. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 66-76. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-94. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-100. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 91-92. 
Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 97-99. Page-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 81-86. Picture, Count.Count = 45976. Picture, % of Total.Train = 4.21. Picture, % of Total.Test = 2.78. Picture, % of Total.Val = 5.31. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).All = 69-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 56-59. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 82-86. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 69-82. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 80-95. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 66-71. Picture, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 59-76. Section-header, Count.Count = 142884. Section-header, % of Total.Train = 12.60. Section-header, % of Total.Test = 15.77. Section-header, % of Total.Val = 12.85. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).All = 83-84. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 76-81. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 90-92. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-95. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-94. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 69-73. Section-header, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 78-86. Table, Count.Count = 34733. Table, % of Total.Train = 3.20. Table, % of Total.Test = 2.27. Table, % of Total.Val = 3.60. Table, triple inter-annotator mAP @ 0.5-0.95 (%).All = 77-81. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 75-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 83-86. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 98-99. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 58-80. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 79-84. Table, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 70-85. Text, Count.Count = 510377. Text, % of Total.Train = 45.82. Text, % of Total.Test = 49.28. Text, % of Total.Val = 45.00. Text, triple inter-annotator mAP @ 0.5-0.95 (%).All = 84-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 81-86. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 88-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-93. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 87-92. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-79. Text, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 87-95. Title, Count.Count = 5071. Title, % of Total.Train = 0.47. Title, % of Total.Test = 0.30. Title, % of Total.Val = 0.50. Title, triple inter-annotator mAP @ 0.5-0.95 (%).All = 60-72. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 24-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 50-63. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 94-100. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 82-96. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 68-79. Title, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 24-56. Total, Count.Count = 1107470. Total, % of Total.Train = 941123. Total, % of Total.Test = 99816. Total, % of Total.Val = 66531. Total, triple inter-annotator mAP @ 0.5-0.95 (%).All = 82-83. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Fin = 71-74. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Man = 79-81. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Sci = 89-94. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Law = 86-91. Total, triple inter-annotator mAP @ 0.5-0.95 (%).Pat = 71-76. 
Total, triple inter-annotator mAP @ 0.5-0.95 (%).Ten = 68-85", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image I can see a blue circle.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/10", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/694" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 332.130615234375, + "t": 576.3017578125, + "r": 346.93829345703125, + "b": 560.4401550292969, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "include publication repositories such as arXiv", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/695", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 223.57, + "t": 471.407, + "r": 306.847, + "b": 465.079, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 46 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. 
The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", @@ -4365,74 +6482,743 @@ "label": "text", "prov": [ { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 146 - ] + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 146 + ] + }, + { + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 147, + 294 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/697", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 564.097, + "r": 408.543, + "b": 561.395, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 54 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "A table with different columns and rows.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/11", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/698" + }, + { + "$ref": "#/texts/699" + }, + { + "$ref": "#/texts/700" + }, + { + "$ref": "#/texts/701" + }, + { + "$ref": "#/texts/702" + }, + { + "$ref": "#/texts/703" + }, + { + "$ref": "#/texts/704" + }, + { + "$ref": "#/texts/705" + }, + { + "$ref": "#/texts/706" + }, + { + "$ref": "#/texts/707" + }, + { + "$ref": "#/texts/708" + }, + { + "$ref": "#/texts/709" + }, + { + "$ref": "#/texts/710" + }, + { + "$ref": "#/texts/711" + }, + { + "$ref": "#/texts/712" + }, + { + "$ref": "#/texts/713" + }, + { + "$ref": "#/texts/714" + }, + { + "$ref": "#/texts/715" + }, + { + "$ref": "#/texts/716" + }, + { + "$ref": "#/texts/717" + }, + { + "$ref": "#/texts/718" + }, + { + "$ref": "#/texts/719" + }, + { + "$ref": "#/texts/720" + }, + { + "$ref": "#/texts/721" + }, + { + "$ref": "#/texts/722" + }, + { + "$ref": "#/texts/723" + }, + { + "$ref": "#/texts/724" + }, + { + "$ref": "#/texts/725" + }, + { + "$ref": "#/texts/726" + }, + { + "$ref": "#/texts/727" + }, + { + "$ref": "#/texts/728" + }, + { + "$ref": "#/texts/729" + }, + { + "$ref": "#/texts/730" + }, + { + "$ref": "#/texts/731" + }, + { + "$ref": "#/texts/732" + }, + { + "$ref": "#/texts/733" + }, + { + "$ref": "#/texts/734" + }, + { + "$ref": "#/texts/735" + }, + { + "$ref": "#/texts/736" + }, + { + "$ref": "#/texts/737" + }, + { + "$ref": "#/texts/738" + }, + { + "$ref": "#/texts/739" + }, + { + "$ref": "#/texts/740" + }, + { + "$ref": "#/texts/741" + }, + { + "$ref": "#/texts/742" + }, + { + "$ref": "#/texts/743" + }, + { + "$ref": "#/texts/744" + }, + { + "$ref": "#/texts/745" + }, + { + "$ref": 
"#/texts/746" + }, + { + "$ref": "#/texts/747" + }, + { + "$ref": "#/texts/748" + }, + { + "$ref": "#/texts/749" + }, + { + "$ref": "#/texts/750" + }, + { + "$ref": "#/texts/751" + }, + { + "$ref": "#/texts/752" + }, + { + "$ref": "#/texts/753" + }, + { + "$ref": "#/texts/754" + }, + { + "$ref": "#/texts/755" + }, + { + "$ref": "#/texts/756" + }, + { + "$ref": "#/texts/757" + }, + { + "$ref": "#/texts/758" + }, + { + "$ref": "#/texts/759" + }, + { + "$ref": "#/texts/760" + }, + { + "$ref": "#/texts/761" + }, + { + "$ref": "#/texts/762" + }, + { + "$ref": "#/texts/763" + }, + { + "$ref": "#/texts/764" + }, + { + "$ref": "#/texts/765" + }, + { + "$ref": "#/texts/766" + }, + { + "$ref": "#/texts/767" + }, + { + "$ref": "#/texts/768" + }, + { + "$ref": "#/texts/769" + }, + { + "$ref": "#/texts/770" + }, + { + "$ref": "#/texts/771" + }, + { + "$ref": "#/texts/772" + }, + { + "$ref": "#/texts/773" + }, + { + "$ref": "#/texts/774" + }, + { + "$ref": "#/texts/775" + }, + { + "$ref": "#/texts/776" + }, + { + "$ref": "#/texts/777" + }, + { + "$ref": "#/texts/778" + }, + { + "$ref": "#/texts/779" + }, + { + "$ref": "#/texts/780" + }, + { + "$ref": "#/texts/781" + }, + { + "$ref": "#/texts/782" + }, + { + "$ref": "#/texts/783" + }, + { + "$ref": "#/texts/784" + }, + { + "$ref": "#/texts/785" + }, + { + "$ref": "#/texts/786" + }, + { + "$ref": "#/texts/787" + }, + { + "$ref": "#/texts/788" + }, + { + "$ref": "#/texts/789" + }, + { + "$ref": "#/texts/790" + }, + { + "$ref": "#/texts/791" + }, + { + "$ref": "#/texts/792" + }, + { + "$ref": "#/texts/793" + }, + { + "$ref": "#/texts/794" + }, + { + "$ref": "#/texts/795" + }, + { + "$ref": "#/texts/796" + }, + { + "$ref": "#/texts/797" + }, + { + "$ref": "#/texts/798" + }, + { + "$ref": "#/texts/799" + }, + { + "$ref": "#/texts/800" + }, + { + "$ref": "#/texts/801" + }, + { + "$ref": "#/texts/802" + }, + { + "$ref": "#/texts/803" + }, + { + "$ref": "#/texts/804" + }, + { + "$ref": "#/texts/805" + }, + { + "$ref": "#/texts/806" + }, + { + "$ref": "#/texts/807" + }, + { + "$ref": "#/texts/808" + }, + { + "$ref": "#/texts/809" + }, + { + "$ref": "#/texts/810" + }, + { + "$ref": "#/texts/811" + }, + { + "$ref": "#/texts/812" + }, + { + "$ref": "#/texts/813" + }, + { + "$ref": "#/texts/814" + }, + { + "$ref": "#/texts/815" + }, + { + "$ref": "#/texts/816" + }, + { + "$ref": "#/texts/817" + }, + { + "$ref": "#/texts/818" + }, + { + "$ref": "#/texts/819" + }, + { + "$ref": "#/texts/820" + }, + { + "$ref": "#/texts/821" + }, + { + "$ref": "#/texts/822" + }, + { + "$ref": "#/texts/823" + }, + { + "$ref": "#/texts/824" + }, + { + "$ref": "#/texts/825" + }, + { + "$ref": "#/texts/826" + }, + { + "$ref": "#/texts/827" + }, + { + "$ref": "#/texts/828" + }, + { + "$ref": "#/texts/829" + }, + { + "$ref": "#/texts/830" + }, + { + "$ref": "#/texts/831" + }, + { + "$ref": "#/texts/832" + }, + { + "$ref": "#/texts/833" + }, + { + "$ref": "#/texts/834" + }, + { + "$ref": "#/texts/835" + }, + { + "$ref": "#/texts/836" + }, + { + "$ref": "#/texts/837" + }, + { + "$ref": "#/texts/838" + }, + { + "$ref": "#/texts/839" + }, + { + "$ref": "#/texts/840" + }, + { + "$ref": "#/texts/841" + }, + { + "$ref": "#/texts/842" + }, + { + "$ref": "#/texts/843" + }, + { + "$ref": "#/texts/844" + }, + { + "$ref": "#/texts/845" + }, + { + "$ref": "#/texts/846" + }, + { + "$ref": "#/texts/847" + }, + { + "$ref": "#/texts/848" + }, + { + "$ref": "#/texts/849" + }, + { + "$ref": "#/texts/850" + }, + { + "$ref": "#/texts/851" + }, + { + "$ref": "#/texts/852" + }, + { + "$ref": "#/texts/853" + }, + { 
+ "$ref": "#/texts/854" + }, + { + "$ref": "#/texts/855" + }, + { + "$ref": "#/texts/856" + }, + { + "$ref": "#/texts/857" + }, + { + "$ref": "#/texts/858" + }, + { + "$ref": "#/texts/859" + }, + { + "$ref": "#/texts/860" + }, + { + "$ref": "#/texts/861" + }, + { + "$ref": "#/texts/862" + }, + { + "$ref": "#/texts/863" + }, + { + "$ref": "#/texts/864" + }, + { + "$ref": "#/texts/865" + }, + { + "$ref": "#/texts/866" + }, + { + "$ref": "#/texts/867" + }, + { + "$ref": "#/texts/868" + }, + { + "$ref": "#/texts/869" + }, + { + "$ref": "#/texts/870" + }, + { + "$ref": "#/texts/871" + }, + { + "$ref": "#/texts/872" + }, + { + "$ref": "#/texts/873" + }, + { + "$ref": "#/texts/874" + }, + { + "$ref": "#/texts/875" + }, + { + "$ref": "#/texts/876" + }, + { + "$ref": "#/texts/877" + }, + { + "$ref": "#/texts/878" + }, + { + "$ref": "#/texts/879" + }, + { + "$ref": "#/texts/880" + }, + { + "$ref": "#/texts/881" + }, + { + "$ref": "#/texts/882" + }, + { + "$ref": "#/texts/883" + }, + { + "$ref": "#/texts/884" + }, + { + "$ref": "#/texts/885" + }, + { + "$ref": "#/texts/886" + }, + { + "$ref": "#/texts/887" + }, + { + "$ref": "#/texts/888" + }, + { + "$ref": "#/texts/889" + }, + { + "$ref": "#/texts/890" + }, + { + "$ref": "#/texts/891" + }, + { + "$ref": "#/texts/892" }, { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 147, - 294 - ] + "$ref": "#/texts/893" + }, + { + "$ref": "#/texts/894" + }, + { + "$ref": "#/texts/895" + }, + { + "$ref": "#/texts/896" + }, + { + "$ref": "#/texts/897" + }, + { + "$ref": "#/texts/898" + }, + { + "$ref": "#/texts/899" + }, + { + "$ref": "#/texts/900" + }, + { + "$ref": "#/texts/901" + }, + { + "$ref": "#/texts/902" + }, + { + "$ref": "#/texts/903" + }, + { + "$ref": "#/texts/904" + }, + { + "$ref": "#/texts/905" } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/697", - "parent": { - "$ref": "#/body" - }, - "children": [], + ], "content_layer": "body", - "label": "text", + "meta": {}, + "label": "picture", "prov": [ { "page_no": 9, "bbox": { - "l": 335.152, - "t": 564.097, - "r": 408.543, - "b": 561.395, + "l": 334.4932861328125, + "t": 558.5665130615234, + "r": 544.7938842773438, + "b": 414.31744384765625, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 54 + 0 ] } ] @@ -4893,6 +7679,249 @@ } } }, + { + "text": "In this image there is a table with some text on it.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/12", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/915" + }, + { + "$ref": "#/texts/916" + }, + { + "$ref": "#/texts/917" + }, + { + "$ref": "#/texts/918" + }, + { + "$ref": "#/texts/919" + }, + { + "$ref": "#/texts/920" + }, + { + "$ref": "#/texts/921" + }, + { + "$ref": "#/texts/922" + }, + { + "$ref": "#/texts/923" + }, + { + "$ref": "#/texts/924" + }, + { + "$ref": "#/texts/925" + }, + { + "$ref": "#/texts/926" + }, + { + "$ref": "#/texts/927" + }, + { + "$ref": "#/texts/928" + }, + { + "$ref": "#/texts/929" + }, + { + "$ref": "#/texts/930" + }, + 
{ + "$ref": "#/texts/931" + }, + { + "$ref": "#/texts/932" + }, + { + "$ref": "#/texts/933" + }, + { + "$ref": "#/texts/934" + }, + { + "$ref": "#/texts/935" + }, + { + "$ref": "#/texts/936" + }, + { + "$ref": "#/texts/937" + }, + { + "$ref": "#/texts/938" + }, + { + "$ref": "#/texts/939" + }, + { + "$ref": "#/texts/940" + }, + { + "$ref": "#/texts/941" + }, + { + "$ref": "#/texts/942" + }, + { + "$ref": "#/texts/943" + }, + { + "$ref": "#/texts/944" + }, + { + "$ref": "#/texts/945" + }, + { + "$ref": "#/texts/946" + }, + { + "$ref": "#/texts/947" + }, + { + "$ref": "#/texts/948" + }, + { + "$ref": "#/texts/949" + }, + { + "$ref": "#/texts/950" + }, + { + "$ref": "#/texts/951" + }, + { + "$ref": "#/texts/952" + }, + { + "$ref": "#/texts/953" + }, + { + "$ref": "#/texts/954" + }, + { + "$ref": "#/texts/955" + }, + { + "$ref": "#/texts/956" + }, + { + "$ref": "#/texts/957" + }, + { + "$ref": "#/texts/958" + }, + { + "$ref": "#/texts/959" + }, + { + "$ref": "#/texts/960" + }, + { + "$ref": "#/texts/961" + }, + { + "$ref": "#/texts/962" + }, + { + "$ref": "#/texts/963" + }, + { + "$ref": "#/texts/964" + }, + { + "$ref": "#/texts/965" + }, + { + "$ref": "#/texts/966" + }, + { + "$ref": "#/texts/967" + }, + { + "$ref": "#/texts/968" + }, + { + "$ref": "#/texts/969" + }, + { + "$ref": "#/texts/970" + }, + { + "$ref": "#/texts/971" + }, + { + "$ref": "#/texts/972" + }, + { + "$ref": "#/texts/973" + }, + { + "$ref": "#/texts/974" + }, + { + "$ref": "#/texts/975" + }, + { + "$ref": "#/texts/976" + }, + { + "$ref": "#/texts/977" + }, + { + "$ref": "#/texts/978" + }, + { + "$ref": "#/texts/979" + }, + { + "$ref": "#/texts/980" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 108.79005432128906, + "t": 467.1181335449219, + "r": 329.1195068359375, + "b": 308.97198486328125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, { "text": "we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific", "meta": { diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index 87597d93..27ec0ce9 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -1,5 +1,45 @@ { "root": [ + { + "text": "In this image we can see a cartoon image of a duck holding a paper.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 261.966552734375, + "t": 715.8966522216797, + "r": 348.65899658203125, + "b": 627.1333770751953, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, { "text": "Version 1.0", "meta": { @@ -813,11 +853,86 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible.", + "text": "In this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ + { + "self_ref": "#/pictures/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, + { + "$ref": "#/texts/37" + }, + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/texts/39" + }, + { + "$ref": "#/texts/40" + }, + { + "$ref": "#/texts/41" + }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, + { + "$ref": "#/texts/44" + }, + { + "$ref": "#/texts/45" + }, + { + "$ref": "#/texts/46" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 110.07231140136719, + "t": 719.2913360595703, + "r": 500.7577209472656, + "b": 581.2926177978516, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + }, { "self_ref": "#/texts/31", "parent": { @@ -3146,1210 +3261,3212 @@ } }, { - "text": "AGL Energy Limited ABN 74 1", + "text": "In this image there is a table with some text on it.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ { - "self_ref": "#/texts/393", + "self_ref": "#/pictures/2", "parent": { "$ref": "#/body" }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "children": [ { - "page_no": 7, - "bbox": { - "l": 226.786, - "t": 560.516, - "r": 233.176, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "5 061 375", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/394", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/129" + }, { - "page_no": 7, - "bbox": { - "l": 233.40500000000003, - "t": 560.516, - "r": 235.66499999999996, - "b": 559.937, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 9 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 1: Four examples of complex page layouts across different document categories", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/503", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/130" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 499.2799999999999, - "r": 312.251, - "b": 490.75200000000007, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 84 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ], - "origin": { - "mimetype": 
"application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/505", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/131" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 474.62299999999993, - "r": 312.021, - "b": 465.961, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 90 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/507", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/132" + }, { - "page_no": 7, - "bbox": { - "l": 222.539, - "t": 458.719, - "r": 312.156, - "b": 436.156, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 374 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "1 INTRODUCTION", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/508", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/133" + }, { - "page_no": 7, - "bbox": { - "l": 329.602, - "t": 428.537, - "r": 373.375, - "b": 423.963, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 14 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. 
Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/509", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/134" + }, { - "page_no": 7, - "bbox": { - "l": 108.0, - "t": 419.051, - "r": 527.591, - "b": 377.77099999999996, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1026 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/511", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/135" + }, { - "page_no": 8, - "bbox": { - "l": 122.99899999999998, - "t": 563.105, - "r": 338.603, - "b": 558.655, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 130 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. 
All models were initialised using pre-trained weights from the COCO 2017 dataset.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/512", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/136" + }, { - "page_no": 8, - "bbox": { - "l": 122.87200000000001, - "t": 552.103, - "r": 226.37599999999998, - "b": 509.485, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 489 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------|\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/1", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ + "$ref": "#/texts/137" + }, { - "page_no": 8, - "bbox": { - "l": 125.8864517211914, - "t": 505.50439453125, - "r": 223.0050506591797, - "b": 437.8017272949219, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. 
With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/513", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/138" + }, { - "page_no": 8, - "bbox": { - "l": 122.884, - "t": 431.161, - "r": 226.336, - "b": 341.5470000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1252 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/515", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/139" + }, { - "page_no": 8, - "bbox": { - "l": 122.86499999999998, - "t": 327.581, - "r": 226.282, - "b": 284.81, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 584 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Third, achienec", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/516", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/140" + }, { - "page_no": 8, - "bbox": { - "l": 436.0, - "t": 447.0, - "r": 509.66666666666663, - "b": 418.66666666666663, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 15 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "chalenongayouls ground-vuth dawa such WC", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/518", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/141" + }, { - "page_no": 8, - "bbox": { - "l": 366.0, - "t": 386.0, - "r": 529.3333333333334, - "b": 375.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 40 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": 
"2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/519", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/142" + }, { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 469.97300000000007, - "r": 339.288, - "b": 441.408, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 322 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/520", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/143" + }, { - "page_no": 8, - "bbox": { - "l": 235.911, - "t": 425.568, - "r": 338.603, - "b": 415.587, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 102 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/521", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/144" + }, { - "page_no": 8, - "bbox": { - "l": 235.776, - "t": 416.19999999999993, - "r": 338.703, - "b": 382.7970000000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 397 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. 
This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/523", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/145" + }, { - "page_no": 8, - "bbox": { - "l": 235.823, - "t": 370.85, - "r": 338.7, - "b": 285.921, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 1146 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "coioct dcochon modols", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/524", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/146" + }, { - "page_no": 8, - "bbox": { - "l": 456.6666666666667, - "t": 344.0, - "r": 485.33333333333337, - "b": 341.33333333333337, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 21 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "mak enbrel", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/526", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/147" + }, { - "page_no": 8, - "bbox": { - "l": 470.6666666666667, - "t": 308.6666666666667, - "r": 524.0, - "b": 285.3333333333333, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 10 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/527", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/148" + }, { - "page_no": 8, - "bbox": { - "l": 108.0, - "t": 266.424, - "r": 504.00300000000004, - "b": 225.14499999999998, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 393 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/529", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/149" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 598.985, - "r": 186.95, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 48 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/530", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/150" + }, { - "page_no": 9, - "bbox": { - "l": 190.471, - "t": 598.985, - "r": 346.254, - "b": 593.669, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 81 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/531", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/151" + }, { - "page_no": 9, - "bbox": { - "l": 88.525, - "t": 586.821, - "r": 346.401, - "b": 580.676, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 123 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/532", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/152" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 575.628, - "r": 301.135, - "b": 569.484, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 99 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/533", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/153" + }, { - "page_no": 9, - "bbox": { - "l": 88.676, - "t": 581.225, - "r": 346.254, - "b": 575.08, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 124 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n| List-item 
| 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/tables/3", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "table", - "prov": [ + "$ref": "#/texts/154" + }, { - "page_no": 9, - "bbox": { - "l": 110.8309097290039, - "t": 560.6356811523438, - "r": 323.92962646484375, - "b": 477.741455078125, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 0 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "include publication repositories such as arXiv", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/695", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ + "$ref": "#/texts/155" + }, { - "page_no": 9, - "bbox": { - "l": 223.57, - "t": 471.407, - "r": 306.847, - "b": 465.079, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 46 - ] - } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { + "$ref": "#/texts/156" + }, + { + "$ref": "#/texts/157" + }, + { + "$ref": "#/texts/158" + }, + { + "$ref": "#/texts/159" + }, + { + "$ref": "#/texts/160" + }, + { + "$ref": "#/texts/161" + }, + { + "$ref": "#/texts/162" + }, + { + "$ref": "#/texts/163" + }, + { + "$ref": "#/texts/164" + }, + { + "$ref": "#/texts/165" + }, + { + "$ref": "#/texts/166" + }, + { + "$ref": "#/texts/167" + }, + { + "$ref": "#/texts/168" + }, + { + "$ref": "#/texts/169" + }, + { + "$ref": "#/texts/170" + }, + { + "$ref": "#/texts/171" + }, + { + "$ref": "#/texts/172" + }, + { + "$ref": "#/texts/173" + }, + { + "$ref": "#/texts/174" + }, + { + "$ref": "#/texts/175" + }, + { + "$ref": "#/texts/176" + }, + { + "$ref": "#/texts/177" + }, + { + "$ref": "#/texts/178" + }, + { + "$ref": "#/texts/179" + }, + { + "$ref": "#/texts/180" + }, + { + "$ref": "#/texts/181" + }, + { + "$ref": "#/texts/182" + }, + { + "$ref": "#/texts/183" + }, + { + "$ref": "#/texts/184" + }, + { + "$ref": "#/texts/185" + }, + { + "$ref": "#/texts/186" + }, + { + "$ref": "#/texts/187" + }, + { + "$ref": "#/texts/188" + }, + { + "$ref": "#/texts/189" + }, + { + "$ref": "#/texts/190" + }, + { + "$ref": "#/texts/191" + }, + { + 
"$ref": "#/texts/192" + }, + { + "$ref": "#/texts/193" + }, + { + "$ref": "#/texts/194" + }, + { + "$ref": "#/texts/195" + }, + { + "$ref": "#/texts/196" + }, + { + "$ref": "#/texts/197" + }, + { + "$ref": "#/texts/198" + }, + { + "$ref": "#/texts/199" + }, + { + "$ref": "#/texts/200" + }, + { + "$ref": "#/texts/201" + }, + { + "$ref": "#/texts/202" + }, + { + "$ref": "#/texts/203" + }, + { + "$ref": "#/texts/204" + }, + { + "$ref": "#/texts/205" + }, + { + "$ref": "#/texts/206" + }, + { + "$ref": "#/texts/207" + }, + { + "$ref": "#/texts/208" + }, + { + "$ref": "#/texts/209" + }, + { + "$ref": "#/texts/210" + }, + { + "$ref": "#/texts/211" + }, + { + "$ref": "#/texts/212" + }, + { + "$ref": "#/texts/213" + }, + { + "$ref": "#/texts/214" + }, + { + "$ref": "#/texts/215" + }, + { + "$ref": "#/texts/216" + }, + { + "$ref": "#/texts/217" + }, + { + "$ref": "#/texts/218" + }, + { + "$ref": "#/texts/219" + }, + { + "$ref": "#/texts/220" + }, + { + "$ref": "#/texts/221" + }, + { + "$ref": "#/texts/222" + }, + { + "$ref": "#/texts/223" + }, + { + "$ref": "#/texts/224" + }, + { + "$ref": "#/texts/225" + }, + { + "$ref": "#/texts/226" + }, + { + "$ref": "#/texts/227" + }, + { + "$ref": "#/texts/228" + }, + { + "$ref": "#/texts/229" + }, + { + "$ref": "#/texts/230" + }, + { + "$ref": "#/texts/231" + }, + { + "$ref": "#/texts/232" + }, + { + "$ref": "#/texts/233" + }, + { + "$ref": "#/texts/234" + }, + { + "$ref": "#/texts/235" + }, + { + "$ref": "#/texts/236" + }, + { + "$ref": "#/texts/237" + }, + { + "$ref": "#/texts/238" + }, + { + "$ref": "#/texts/239" + }, + { + "$ref": "#/texts/240" + }, + { + "$ref": "#/texts/241" + }, + { + "$ref": "#/texts/242" + }, + { + "$ref": "#/texts/243" + }, + { + "$ref": "#/texts/244" + }, + { + "$ref": "#/texts/245" + }, + { + "$ref": "#/texts/246" + }, + { + "$ref": "#/texts/247" + }, + { + "$ref": "#/texts/248" + }, + { + "$ref": "#/texts/249" + }, + { + "$ref": "#/texts/250" + }, + { + "$ref": "#/texts/251" + }, + { + "$ref": "#/texts/252" + }, + { + "$ref": "#/texts/253" + }, + { + "$ref": "#/texts/254" + }, + { + "$ref": "#/texts/255" + }, + { + "$ref": "#/texts/256" + }, + { + "$ref": "#/texts/257" + }, + { + "$ref": "#/texts/258" + }, + { + "$ref": "#/texts/259" + }, + { + "$ref": "#/texts/260" + }, + { + "$ref": "#/texts/261" + }, + { + "$ref": "#/texts/262" + }, + { + "$ref": "#/texts/263" + }, + { + "$ref": "#/texts/264" + }, + { + "$ref": "#/texts/265" + }, + { + "$ref": "#/texts/266" + }, + { + "$ref": "#/texts/267" + }, + { + "$ref": "#/texts/268" + }, + { + "$ref": "#/texts/269" + }, + { + "$ref": "#/texts/270" + }, + { + "$ref": "#/texts/271" + }, + { + "$ref": "#/texts/272" + }, + { + "$ref": "#/texts/273" + }, + { + "$ref": "#/texts/274" + }, + { + "$ref": "#/texts/275" + }, + { + "$ref": "#/texts/276" + }, + { + "$ref": "#/texts/277" + }, + { + "$ref": "#/texts/278" + }, + { + "$ref": "#/texts/279" + }, + { + "$ref": "#/texts/280" + }, + { + "$ref": "#/texts/281" + }, + { + "$ref": "#/texts/282" + }, + { + "$ref": "#/texts/283" + }, + { + "$ref": "#/texts/284" + }, + { + "$ref": "#/texts/285" + }, + { + "$ref": "#/texts/286" + }, + { + "$ref": "#/texts/287" + }, + { + "$ref": "#/texts/288" + }, + { + "$ref": "#/texts/289" + }, + { + "$ref": "#/texts/290" + }, + { + "$ref": "#/texts/291" + }, + { + "$ref": "#/texts/292" + }, + { + "$ref": "#/texts/293" + }, + { + "$ref": "#/texts/294" + }, + { + "$ref": "#/texts/295" + }, + { + "$ref": "#/texts/296" + }, + { + "$ref": "#/texts/297" + }, + { + "$ref": "#/texts/298" + }, + { + "$ref": "#/texts/299" + 
}, + { + "$ref": "#/texts/300" + }, + { + "$ref": "#/texts/301" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 223.45245361328125, + "t": 606.3411560058594, + "r": 277.1462707519531, + "b": 563.2440032958984, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image we can see a text.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/3", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/302" + }, + { + "$ref": "#/texts/303" + }, + { + "$ref": "#/texts/304" + }, + { + "$ref": "#/texts/305" + }, + { + "$ref": "#/texts/306" + }, + { + "$ref": "#/texts/307" + }, + { + "$ref": "#/texts/308" + }, + { + "$ref": "#/texts/309" + }, + { + "$ref": "#/texts/310" + }, + { + "$ref": "#/texts/311" + }, + { + "$ref": "#/texts/312" + }, + { + "$ref": "#/texts/313" + }, + { + "$ref": "#/texts/314" + }, + { + "$ref": "#/texts/315" + }, + { + "$ref": "#/texts/316" + }, + { + "$ref": "#/texts/317" + }, + { + "$ref": "#/texts/318" + }, + { + "$ref": "#/texts/319" + }, + { + "$ref": "#/texts/320" + }, + { + "$ref": "#/texts/321" + }, + { + "$ref": "#/texts/322" + }, + { + "$ref": "#/texts/323" + }, + { + "$ref": "#/texts/324" + }, + { + "$ref": "#/texts/325" + }, + { + "$ref": "#/texts/326" + }, + { + "$ref": "#/texts/327" + }, + { + "$ref": "#/texts/328" + }, + { + "$ref": "#/texts/329" + }, + { + "$ref": "#/texts/330" + }, + { + "$ref": "#/texts/331" + }, + { + "$ref": "#/texts/332" + }, + { + "$ref": "#/texts/333" + }, + { + "$ref": "#/texts/334" + }, + { + "$ref": "#/texts/335" + }, + { + "$ref": "#/texts/336" + }, + { + "$ref": "#/texts/337" + }, + { + "$ref": "#/texts/338" + }, + { + "$ref": "#/texts/339" + }, + { + "$ref": "#/texts/340" + }, + { + "$ref": "#/texts/341" + }, + { + "$ref": "#/texts/342" + }, + { + "$ref": "#/texts/343" + }, + { + "$ref": "#/texts/344" + }, + { + "$ref": "#/texts/345" + }, + { + "$ref": "#/texts/346" + }, + { + "$ref": "#/texts/347" + }, + { + "$ref": "#/texts/348" + }, + { + "$ref": "#/texts/349" + }, + { + "$ref": "#/texts/350" + }, + { + "$ref": "#/texts/351" + }, + { + "$ref": "#/texts/352" + }, + { + "$ref": "#/texts/353" + }, + { + "$ref": "#/texts/354" + }, + { + "$ref": "#/texts/355" + }, + { + "$ref": "#/texts/356" + }, + { + "$ref": "#/texts/357" + }, + { + "$ref": "#/texts/358" + }, + { + "$ref": "#/texts/359" + }, + { + "$ref": "#/texts/360" + }, + { + "$ref": "#/texts/361" + }, + { + "$ref": "#/texts/362" + }, + { + "$ref": "#/texts/363" + }, + { + "$ref": "#/texts/364" + }, + { + "$ref": "#/texts/365" + }, + { + "$ref": "#/texts/366" + }, + { + "$ref": "#/texts/367" + }, + { + "$ref": "#/texts/368" + }, + { + "$ref": "#/texts/369" + }, + { + "$ref": "#/texts/370" + }, + { + "$ref": "#/texts/371" + }, + { + "$ref": "#/texts/372" + }, + { + "$ref": "#/texts/373" + }, + { + "$ref": "#/texts/374" + }, + { + "$ref": "#/texts/375" + }, + { + "$ref": "#/texts/376" + }, + { + "$ref": "#/texts/377" + }, + { + "$ref": "#/texts/378" + }, + { + "$ref": "#/texts/379" + }, + { + "$ref": "#/texts/380" + }, + { + "$ref": "#/texts/381" + }, + { + "$ref": "#/texts/382" + }, + { + "$ref": "#/texts/383" + }, + { + "$ref": 
"#/texts/384" + }, + { + "$ref": "#/texts/385" + }, + { + "$ref": "#/texts/386" + }, + { + "$ref": "#/texts/387" + }, + { + "$ref": "#/texts/388" + }, + { + "$ref": "#/texts/389" + }, + { + "$ref": "#/texts/390" + }, + { + "$ref": "#/texts/391" + }, + { + "$ref": "#/texts/392" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 279.03204345703125, + "t": 607.0251770019531, + "r": 312.2338562011719, + "b": 562.7499389648438, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "AGL Energy Limited ABN 74 1", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/393", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 226.786, + "t": 560.516, + "r": 233.176, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "5 061 375", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/394", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 233.40500000000003, + "t": 560.516, + "r": 235.66499999999996, + "b": 559.937, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 9 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image I can see the text on the image.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/395" + }, + { + "$ref": "#/texts/396" + }, + { + "$ref": "#/texts/397" + }, + { + "$ref": "#/texts/398" + }, + { + "$ref": "#/texts/399" + }, + { + "$ref": "#/texts/400" + }, + { + "$ref": "#/texts/401" + }, + { + "$ref": "#/texts/402" + }, + { + "$ref": "#/texts/403" + }, + { + "$ref": "#/texts/404" + }, + { + "$ref": "#/texts/405" + }, + { + "$ref": "#/texts/406" + }, + { + "$ref": "#/texts/407" + }, + { + "$ref": "#/texts/408" + }, + { + "$ref": "#/texts/409" + }, + { + "$ref": "#/texts/410" + }, + { + "$ref": "#/texts/411" + }, + { + "$ref": "#/texts/412" + }, + { + "$ref": "#/texts/413" + }, + { + "$ref": "#/texts/414" + }, + { + "$ref": "#/texts/415" + }, + { + "$ref": "#/texts/416" + }, + { + "$ref": "#/texts/417" + }, + { + "$ref": "#/texts/418" + }, + { + "$ref": "#/texts/419" + }, + { + "$ref": "#/texts/420" + }, + { + "$ref": "#/texts/421" + }, + { + "$ref": "#/texts/422" + }, + { + "$ref": "#/texts/423" + }, + { + "$ref": "#/texts/424" + }, + { + "$ref": "#/texts/425" + }, + { + "$ref": "#/texts/426" + }, + { + "$ref": "#/texts/427" + }, + { + "$ref": "#/texts/428" 
+ }, + { + "$ref": "#/texts/429" + }, + { + "$ref": "#/texts/430" + }, + { + "$ref": "#/texts/431" + }, + { + "$ref": "#/texts/432" + }, + { + "$ref": "#/texts/433" + }, + { + "$ref": "#/texts/434" + }, + { + "$ref": "#/texts/435" + }, + { + "$ref": "#/texts/436" + }, + { + "$ref": "#/texts/437" + }, + { + "$ref": "#/texts/438" + }, + { + "$ref": "#/texts/439" + }, + { + "$ref": "#/texts/440" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 224.6795196533203, + "t": 559.731201171875, + "r": 268.13018798828125, + "b": 503.4937438964844, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image there is a paper with some text on it.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/441" + }, + { + "$ref": "#/texts/442" + }, + { + "$ref": "#/texts/443" + }, + { + "$ref": "#/texts/444" + }, + { + "$ref": "#/texts/445" + }, + { + "$ref": "#/texts/446" + }, + { + "$ref": "#/texts/447" + }, + { + "$ref": "#/texts/448" + }, + { + "$ref": "#/texts/449" + }, + { + "$ref": "#/texts/450" + }, + { + "$ref": "#/texts/451" + }, + { + "$ref": "#/texts/452" + }, + { + "$ref": "#/texts/453" + }, + { + "$ref": "#/texts/454" + }, + { + "$ref": "#/texts/455" + }, + { + "$ref": "#/texts/456" + }, + { + "$ref": "#/texts/457" + }, + { + "$ref": "#/texts/458" + }, + { + "$ref": "#/texts/459" + }, + { + "$ref": "#/texts/460" + }, + { + "$ref": "#/texts/461" + }, + { + "$ref": "#/texts/462" + }, + { + "$ref": "#/texts/463" + }, + { + "$ref": "#/texts/464" + }, + { + "$ref": "#/texts/465" + }, + { + "$ref": "#/texts/466" + }, + { + "$ref": "#/texts/467" + }, + { + "$ref": "#/texts/468" + }, + { + "$ref": "#/texts/469" + }, + { + "$ref": "#/texts/470" + }, + { + "$ref": "#/texts/471" + }, + { + "$ref": "#/texts/472" + }, + { + "$ref": "#/texts/473" + }, + { + "$ref": "#/texts/474" + }, + { + "$ref": "#/texts/475" + }, + { + "$ref": "#/texts/476" + }, + { + "$ref": "#/texts/477" + }, + { + "$ref": "#/texts/478" + }, + { + "$ref": "#/texts/479" + }, + { + "$ref": "#/texts/480" + }, + { + "$ref": "#/texts/481" + }, + { + "$ref": "#/texts/482" + }, + { + "$ref": "#/texts/483" + }, + { + "$ref": "#/texts/484" + }, + { + "$ref": "#/texts/485" + }, + { + "$ref": "#/texts/486" + }, + { + "$ref": "#/texts/487" + }, + { + "$ref": "#/texts/488" + }, + { + "$ref": "#/texts/489" + }, + { + "$ref": "#/texts/490" + }, + { + "$ref": "#/texts/491" + }, + { + "$ref": "#/texts/492" + }, + { + "$ref": "#/texts/493" + }, + { + "$ref": "#/texts/494" + }, + { + "$ref": "#/texts/495" + }, + { + "$ref": "#/texts/496" + }, + { + "$ref": "#/texts/497" + }, + { + "$ref": "#/texts/498" + }, + { + "$ref": "#/texts/499" + }, + { + "$ref": "#/texts/500" + }, + { + "$ref": "#/texts/501" + }, + { + "$ref": "#/texts/502" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 269.2328186035156, + "t": 558.8644409179688, + "r": 311.74884033203125, + "b": 502.994873046875, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling 
Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 1: Four examples of complex page layouts across different document categories", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/503", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 499.2799999999999, + "r": 312.251, + "b": 490.75200000000007, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 84 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/505", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 474.62299999999993, + "r": 312.021, + "b": 465.961, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 90 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. 
https://doi.org/10.1145/ 3534678.3539043", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/507", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 222.539, + "t": 458.719, + "r": 312.156, + "b": 436.156, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 374 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "1 INTRODUCTION", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/508", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 329.602, + "t": 428.537, + "r": 373.375, + "b": 423.963, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 14 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/509", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 108.0, + "t": 419.051, + "r": 527.591, + "b": 377.77099999999996, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1026 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. 
Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/511", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.99899999999998, + "t": 563.105, + "r": 338.603, + "b": 558.655, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 130 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/512", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.87200000000001, + "t": 552.103, + "r": 226.37599999999998, + "b": 509.485, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 489 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n|--------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------|\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 125.8864517211914, + "t": 505.50439453125, + "r": 223.0050506591797, + "b": 437.8017272949219, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. 
Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/513", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.884, + "t": 431.161, + "r": 226.336, + "b": 341.5470000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1252 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. 
As such, we will relate to these object detection methods in this", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/515", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 122.86499999999998, + "t": 327.581, + "r": 226.282, + "b": 284.81, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 584 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image, we can see a table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 366.8663635253906, + "t": 542.9663391113281, + "r": 460.8086242675781, + "b": 450.9350280761719, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Third, achienec", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/516", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 436.0, + "t": 447.0, + "r": 509.66666666666663, + "b": 418.66666666666663, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "chalenongayouls ground-vuth dawa such WC", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/518", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 366.0, + "t": 386.0, + "r": 529.3333333333334, + "b": 375.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 40 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. 
Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 237.6404266357422, + "t": 550.1458740234375, + "r": 337.0112609863281, + "b": 477.0093078613281, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/519", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 469.97300000000007, + "r": 339.288, + "b": 441.408, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 322 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/520", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.911, + "t": 425.568, + "r": 338.603, + "b": 415.587, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 102 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). 
These scores are computed by leveraging the evaluation code provided by the COCO API [16].", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/521", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.776, + "t": 416.19999999999993, + "r": 338.703, + "b": 382.7970000000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 397 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . 
This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/523", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 235.823, + "t": 370.85, + "r": 338.7, + "b": 285.921, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1146 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "coioct dcochon modols", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/524", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 456.6666666666667, + "t": 344.0, + "r": 485.33333333333337, + "b": 341.33333333333337, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "mak enbrel", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/526", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 470.6666666666667, + "t": 308.6666666666667, + "r": 524.0, + "b": 285.3333333333333, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 10 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Figure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. 
Experiments' wrapping over the column end is broken up in two and interrupted by the table.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/527", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 8, + "bbox": { + "l": 108.0, + "t": 266.424, + "r": 504.00300000000004, + "b": 225.14499999999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 393 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/529", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 598.985, + "r": 186.95, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 48 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/530", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 190.471, + "t": 598.985, + "r": 346.254, + "b": 593.669, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 81 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "Table 1: DocLayNet dataset overview. 
Along with the frequency of each class label, we present the relative occurrence (as %", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/531", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.525, + "t": 586.821, + "r": 346.401, + "b": 580.676, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 123 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/532", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 575.628, + "r": 301.135, + "b": 569.484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 99 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/533", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 88.676, + "t": 581.225, + "r": 346.254, + "b": 575.08, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 124 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. 
**Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/534" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.43017578125, + "t": 573.9806060791016, + "r": 124.71578216552734, + "b": 559.4710540771484, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/9", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/535" + }, + { + "$ref": "#/texts/536" + }, + { + "$ref": "#/texts/537" + }, + { + "$ref": "#/texts/538" + }, + { + "$ref": "#/texts/539" + }, + { + "$ref": "#/texts/540" + }, + { + "$ref": "#/texts/541" + }, + { + "$ref": "#/texts/542" + }, + { + "$ref": "#/texts/543" + }, + { + "$ref": "#/texts/544" + }, + { + "$ref": "#/texts/545" + }, + { + "$ref": "#/texts/546" + }, + { + "$ref": "#/texts/547" + }, + { + "$ref": "#/texts/548" + }, + { + "$ref": "#/texts/549" + }, + { + "$ref": "#/texts/550" + }, + { + "$ref": "#/texts/551" + }, + { + "$ref": "#/texts/552" + }, + { + "$ref": "#/texts/553" + }, + { + "$ref": "#/texts/554" + }, + { + "$ref": "#/texts/555" + }, + { + "$ref": "#/texts/556" + }, + { + "$ref": "#/texts/557" + }, + { + "$ref": "#/texts/558" + }, + { + "$ref": "#/texts/559" + }, + { + "$ref": "#/texts/560" + }, + { + "$ref": "#/texts/561" + }, + { + "$ref": "#/texts/562" + }, + { + "$ref": "#/texts/563" + }, + { + "$ref": "#/texts/564" + }, + { + "$ref": "#/texts/565" + }, + { + "$ref": "#/texts/566" + }, + { + "$ref": "#/texts/567" + }, + { + "$ref": "#/texts/568" + }, + { + "$ref": "#/texts/569" + }, + { + "$ref": "#/texts/570" + }, + { + "$ref": "#/texts/571" + }, + { + "$ref": "#/texts/572" + }, + { + "$ref": "#/texts/573" + }, + { + "$ref": "#/texts/574" + }, + { + "$ref": "#/texts/575" + }, + { + "$ref": "#/texts/576" + }, + { + "$ref": "#/texts/577" + }, + { + "$ref": "#/texts/578" + }, + { + "$ref": "#/texts/579" + }, + { + "$ref": "#/texts/580" + }, + { + "$ref": "#/texts/581" + }, + { + "$ref": "#/texts/582" + }, + { + "$ref": "#/texts/583" + }, + { + "$ref": "#/texts/584" + }, + { + "$ref": "#/texts/585" + }, + { + "$ref": "#/texts/586" + }, + { + "$ref": "#/texts/587" + }, + { + "$ref": "#/texts/588" + }, + { + "$ref": "#/texts/589" + }, + { + "$ref": "#/texts/590" + }, + { + "$ref": "#/texts/591" + }, + { + "$ref": "#/texts/592" + }, + { + "$ref": "#/texts/593" + }, + { + "$ref": "#/texts/594" + }, + { + "$ref": "#/texts/595" + }, + { + "$ref": "#/texts/596" + }, + { + "$ref": "#/texts/597" + }, + { + "$ref": "#/texts/598" + }, + { + "$ref": "#/texts/599" + }, + { 
+ "$ref": "#/texts/600" + }, + { + "$ref": "#/texts/601" + }, + { + "$ref": "#/texts/602" + }, + { + "$ref": "#/texts/603" + }, + { + "$ref": "#/texts/604" + }, + { + "$ref": "#/texts/605" + }, + { + "$ref": "#/texts/606" + }, + { + "$ref": "#/texts/607" + }, + { + "$ref": "#/texts/608" + }, + { + "$ref": "#/texts/609" + }, + { + "$ref": "#/texts/610" + }, + { + "$ref": "#/texts/611" + }, + { + "$ref": "#/texts/612" + }, + { + "$ref": "#/texts/613" + }, + { + "$ref": "#/texts/614" + }, + { + "$ref": "#/texts/615" + }, + { + "$ref": "#/texts/616" + }, + { + "$ref": "#/texts/617" + }, + { + "$ref": "#/texts/618" + }, + { + "$ref": "#/texts/619" + }, + { + "$ref": "#/texts/620" + }, + { + "$ref": "#/texts/621" + }, + { + "$ref": "#/texts/622" + }, + { + "$ref": "#/texts/623" + }, + { + "$ref": "#/texts/624" + }, + { + "$ref": "#/texts/625" + }, + { + "$ref": "#/texts/626" + }, + { + "$ref": "#/texts/627" + }, + { + "$ref": "#/texts/628" + }, + { + "$ref": "#/texts/629" + }, + { + "$ref": "#/texts/630" + }, + { + "$ref": "#/texts/631" + }, + { + "$ref": "#/texts/632" + }, + { + "$ref": "#/texts/633" + }, + { + "$ref": "#/texts/634" + }, + { + "$ref": "#/texts/635" + }, + { + "$ref": "#/texts/636" + }, + { + "$ref": "#/texts/637" + }, + { + "$ref": "#/texts/638" + }, + { + "$ref": "#/texts/639" + }, + { + "$ref": "#/texts/640" + }, + { + "$ref": "#/texts/641" + }, + { + "$ref": "#/texts/642" + }, + { + "$ref": "#/texts/643" + }, + { + "$ref": "#/texts/644" + }, + { + "$ref": "#/texts/645" + }, + { + "$ref": "#/texts/646" + }, + { + "$ref": "#/texts/647" + }, + { + "$ref": "#/texts/648" + }, + { + "$ref": "#/texts/649" + }, + { + "$ref": "#/texts/650" + }, + { + "$ref": "#/texts/651" + }, + { + "$ref": "#/texts/652" + }, + { + "$ref": "#/texts/653" + }, + { + "$ref": "#/texts/654" + }, + { + "$ref": "#/texts/655" + }, + { + "$ref": "#/texts/656" + }, + { + "$ref": "#/texts/657" + }, + { + "$ref": "#/texts/658" + }, + { + "$ref": "#/texts/659" + }, + { + "$ref": "#/texts/660" + }, + { + "$ref": "#/texts/661" + }, + { + "$ref": "#/texts/662" + }, + { + "$ref": "#/texts/663" + }, + { + "$ref": "#/texts/664" + }, + { + "$ref": "#/texts/665" + }, + { + "$ref": "#/texts/666" + }, + { + "$ref": "#/texts/667" + }, + { + "$ref": "#/texts/668" + }, + { + "$ref": "#/texts/669" + }, + { + "$ref": "#/texts/670" + }, + { + "$ref": "#/texts/671" + }, + { + "$ref": "#/texts/672" + }, + { + "$ref": "#/texts/673" + }, + { + "$ref": "#/texts/674" + }, + { + "$ref": "#/texts/675" + }, + { + "$ref": "#/texts/676" + }, + { + "$ref": "#/texts/677" + }, + { + "$ref": "#/texts/678" + }, + { + "$ref": "#/texts/679" + }, + { + "$ref": "#/texts/680" + }, + { + "$ref": "#/texts/681" + }, + { + "$ref": "#/texts/682" + }, + { + "$ref": "#/texts/683" + }, + { + "$ref": "#/texts/684" + }, + { + "$ref": "#/texts/685" + }, + { + "$ref": "#/texts/686" + }, + { + "$ref": "#/texts/687" + }, + { + "$ref": "#/texts/688" + }, + { + "$ref": "#/texts/689" + }, + { + "$ref": "#/texts/690" + }, + { + "$ref": "#/texts/691" + }, + { + "$ref": "#/texts/692" + }, + { + "$ref": "#/texts/693" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 
14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n|----------------|---------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 110.8309097290039, + "t": 560.6356811523438, + "r": 323.92962646484375, + "b": 477.741455078125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "In this image I can see a blue circle.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/10", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/694" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 332.130615234375, + "t": 576.3017578125, + "r": 346.93829345703125, + "b": 560.4401550292969, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 
14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "include publication repositories such as arXiv", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/695", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 223.57, + "t": 471.407, + "r": 306.847, + "b": 465.079, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 46 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", @@ -4365,74 +6482,743 @@ "label": "text", "prov": [ { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 146 - ] + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 146 + ] + }, + { + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 573.216, + "r": 521.726, + "b": 570.514, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 147, + 294 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "annotated pages, from which we obtain accuracy ranges.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/texts/697", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 335.152, + "t": 564.097, + "r": 408.543, + "b": 561.395, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 54 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, + { + "text": "A table with different columns and rows.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/11", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/698" + }, + { + "$ref": "#/texts/699" + }, + { + "$ref": "#/texts/700" + }, + { + "$ref": "#/texts/701" + }, + { + "$ref": "#/texts/702" + }, + { + "$ref": "#/texts/703" + }, + { + "$ref": "#/texts/704" + }, + { + "$ref": "#/texts/705" + }, + { + "$ref": "#/texts/706" + }, + { + "$ref": "#/texts/707" + }, + { + "$ref": "#/texts/708" + }, + { + "$ref": "#/texts/709" + }, + { + "$ref": "#/texts/710" + }, + { + "$ref": "#/texts/711" + }, + { + "$ref": "#/texts/712" + }, + { + "$ref": "#/texts/713" + }, + { + "$ref": "#/texts/714" + }, + { + "$ref": "#/texts/715" + }, + { + "$ref": "#/texts/716" + }, 
+ { + "$ref": "#/texts/717" + }, + { + "$ref": "#/texts/718" + }, + { + "$ref": "#/texts/719" + }, + { + "$ref": "#/texts/720" + }, + { + "$ref": "#/texts/721" + }, + { + "$ref": "#/texts/722" + }, + { + "$ref": "#/texts/723" + }, + { + "$ref": "#/texts/724" + }, + { + "$ref": "#/texts/725" + }, + { + "$ref": "#/texts/726" + }, + { + "$ref": "#/texts/727" + }, + { + "$ref": "#/texts/728" + }, + { + "$ref": "#/texts/729" + }, + { + "$ref": "#/texts/730" + }, + { + "$ref": "#/texts/731" + }, + { + "$ref": "#/texts/732" + }, + { + "$ref": "#/texts/733" + }, + { + "$ref": "#/texts/734" + }, + { + "$ref": "#/texts/735" + }, + { + "$ref": "#/texts/736" + }, + { + "$ref": "#/texts/737" + }, + { + "$ref": "#/texts/738" + }, + { + "$ref": "#/texts/739" + }, + { + "$ref": "#/texts/740" + }, + { + "$ref": "#/texts/741" + }, + { + "$ref": "#/texts/742" + }, + { + "$ref": "#/texts/743" + }, + { + "$ref": "#/texts/744" + }, + { + "$ref": "#/texts/745" + }, + { + "$ref": "#/texts/746" + }, + { + "$ref": "#/texts/747" + }, + { + "$ref": "#/texts/748" + }, + { + "$ref": "#/texts/749" + }, + { + "$ref": "#/texts/750" + }, + { + "$ref": "#/texts/751" + }, + { + "$ref": "#/texts/752" + }, + { + "$ref": "#/texts/753" + }, + { + "$ref": "#/texts/754" + }, + { + "$ref": "#/texts/755" + }, + { + "$ref": "#/texts/756" + }, + { + "$ref": "#/texts/757" + }, + { + "$ref": "#/texts/758" + }, + { + "$ref": "#/texts/759" + }, + { + "$ref": "#/texts/760" + }, + { + "$ref": "#/texts/761" + }, + { + "$ref": "#/texts/762" + }, + { + "$ref": "#/texts/763" + }, + { + "$ref": "#/texts/764" + }, + { + "$ref": "#/texts/765" + }, + { + "$ref": "#/texts/766" + }, + { + "$ref": "#/texts/767" + }, + { + "$ref": "#/texts/768" + }, + { + "$ref": "#/texts/769" + }, + { + "$ref": "#/texts/770" + }, + { + "$ref": "#/texts/771" + }, + { + "$ref": "#/texts/772" + }, + { + "$ref": "#/texts/773" + }, + { + "$ref": "#/texts/774" + }, + { + "$ref": "#/texts/775" + }, + { + "$ref": "#/texts/776" + }, + { + "$ref": "#/texts/777" + }, + { + "$ref": "#/texts/778" + }, + { + "$ref": "#/texts/779" + }, + { + "$ref": "#/texts/780" + }, + { + "$ref": "#/texts/781" + }, + { + "$ref": "#/texts/782" + }, + { + "$ref": "#/texts/783" + }, + { + "$ref": "#/texts/784" + }, + { + "$ref": "#/texts/785" + }, + { + "$ref": "#/texts/786" + }, + { + "$ref": "#/texts/787" + }, + { + "$ref": "#/texts/788" + }, + { + "$ref": "#/texts/789" + }, + { + "$ref": "#/texts/790" + }, + { + "$ref": "#/texts/791" + }, + { + "$ref": "#/texts/792" + }, + { + "$ref": "#/texts/793" + }, + { + "$ref": "#/texts/794" + }, + { + "$ref": "#/texts/795" + }, + { + "$ref": "#/texts/796" + }, + { + "$ref": "#/texts/797" + }, + { + "$ref": "#/texts/798" + }, + { + "$ref": "#/texts/799" + }, + { + "$ref": "#/texts/800" + }, + { + "$ref": "#/texts/801" + }, + { + "$ref": "#/texts/802" + }, + { + "$ref": "#/texts/803" + }, + { + "$ref": "#/texts/804" + }, + { + "$ref": "#/texts/805" + }, + { + "$ref": "#/texts/806" + }, + { + "$ref": "#/texts/807" + }, + { + "$ref": "#/texts/808" + }, + { + "$ref": "#/texts/809" + }, + { + "$ref": "#/texts/810" + }, + { + "$ref": "#/texts/811" + }, + { + "$ref": "#/texts/812" + }, + { + "$ref": "#/texts/813" + }, + { + "$ref": "#/texts/814" + }, + { + "$ref": "#/texts/815" + }, + { + "$ref": "#/texts/816" + }, + { + "$ref": "#/texts/817" + }, + { + "$ref": "#/texts/818" + }, + { + "$ref": "#/texts/819" + }, + { + "$ref": "#/texts/820" + }, + { + "$ref": "#/texts/821" + }, + { + "$ref": "#/texts/822" + }, + { + "$ref": "#/texts/823" + }, + { + "$ref": 
"#/texts/824" + }, + { + "$ref": "#/texts/825" + }, + { + "$ref": "#/texts/826" + }, + { + "$ref": "#/texts/827" + }, + { + "$ref": "#/texts/828" + }, + { + "$ref": "#/texts/829" + }, + { + "$ref": "#/texts/830" + }, + { + "$ref": "#/texts/831" + }, + { + "$ref": "#/texts/832" + }, + { + "$ref": "#/texts/833" + }, + { + "$ref": "#/texts/834" + }, + { + "$ref": "#/texts/835" + }, + { + "$ref": "#/texts/836" + }, + { + "$ref": "#/texts/837" + }, + { + "$ref": "#/texts/838" + }, + { + "$ref": "#/texts/839" + }, + { + "$ref": "#/texts/840" + }, + { + "$ref": "#/texts/841" + }, + { + "$ref": "#/texts/842" + }, + { + "$ref": "#/texts/843" + }, + { + "$ref": "#/texts/844" + }, + { + "$ref": "#/texts/845" + }, + { + "$ref": "#/texts/846" + }, + { + "$ref": "#/texts/847" + }, + { + "$ref": "#/texts/848" + }, + { + "$ref": "#/texts/849" + }, + { + "$ref": "#/texts/850" + }, + { + "$ref": "#/texts/851" + }, + { + "$ref": "#/texts/852" + }, + { + "$ref": "#/texts/853" + }, + { + "$ref": "#/texts/854" + }, + { + "$ref": "#/texts/855" + }, + { + "$ref": "#/texts/856" + }, + { + "$ref": "#/texts/857" + }, + { + "$ref": "#/texts/858" + }, + { + "$ref": "#/texts/859" + }, + { + "$ref": "#/texts/860" + }, + { + "$ref": "#/texts/861" + }, + { + "$ref": "#/texts/862" + }, + { + "$ref": "#/texts/863" + }, + { + "$ref": "#/texts/864" + }, + { + "$ref": "#/texts/865" + }, + { + "$ref": "#/texts/866" + }, + { + "$ref": "#/texts/867" + }, + { + "$ref": "#/texts/868" + }, + { + "$ref": "#/texts/869" + }, + { + "$ref": "#/texts/870" + }, + { + "$ref": "#/texts/871" + }, + { + "$ref": "#/texts/872" + }, + { + "$ref": "#/texts/873" + }, + { + "$ref": "#/texts/874" + }, + { + "$ref": "#/texts/875" + }, + { + "$ref": "#/texts/876" + }, + { + "$ref": "#/texts/877" + }, + { + "$ref": "#/texts/878" + }, + { + "$ref": "#/texts/879" + }, + { + "$ref": "#/texts/880" + }, + { + "$ref": "#/texts/881" + }, + { + "$ref": "#/texts/882" + }, + { + "$ref": "#/texts/883" + }, + { + "$ref": "#/texts/884" + }, + { + "$ref": "#/texts/885" + }, + { + "$ref": "#/texts/886" + }, + { + "$ref": "#/texts/887" + }, + { + "$ref": "#/texts/888" + }, + { + "$ref": "#/texts/889" + }, + { + "$ref": "#/texts/890" + }, + { + "$ref": "#/texts/891" + }, + { + "$ref": "#/texts/892" }, { - "page_no": 9, - "bbox": { - "l": 335.152, - "t": 573.216, - "r": 521.726, - "b": 570.514, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 147, - 294 - ] + "$ref": "#/texts/893" + }, + { + "$ref": "#/texts/894" + }, + { + "$ref": "#/texts/895" + }, + { + "$ref": "#/texts/896" + }, + { + "$ref": "#/texts/897" + }, + { + "$ref": "#/texts/898" + }, + { + "$ref": "#/texts/899" + }, + { + "$ref": "#/texts/900" + }, + { + "$ref": "#/texts/901" + }, + { + "$ref": "#/texts/902" + }, + { + "$ref": "#/texts/903" + }, + { + "$ref": "#/texts/904" + }, + { + "$ref": "#/texts/905" } - ] - } - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ], - "origin": { - "mimetype": "application/pdf", - "binary_hash": 14981478401387673002, - "filename": "2408.09869v3.pdf" - } - } - }, - { - "text": "annotated pages, from which we obtain accuracy ranges.", - "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", - "version": "1.0.0", - "doc_items": [ - { - "self_ref": "#/texts/697", - "parent": { - "$ref": "#/body" - }, - "children": [], + ], "content_layer": "body", - "label": "text", + "meta": {}, + "label": "picture", "prov": [ { "page_no": 9, "bbox": { - "l": 335.152, - "t": 564.097, - "r": 408.543, - "b": 561.395, + "l": 
334.4932861328125, + "t": 558.5665130615234, + "r": 544.7938842773438, + "b": 414.31744384765625, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 54 + 0 ] } ] @@ -4893,6 +7679,249 @@ } } }, + { + "text": "In this image there is a table with some text on it.", + "meta": { + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", + "doc_items": [ + { + "self_ref": "#/pictures/12", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/915" + }, + { + "$ref": "#/texts/916" + }, + { + "$ref": "#/texts/917" + }, + { + "$ref": "#/texts/918" + }, + { + "$ref": "#/texts/919" + }, + { + "$ref": "#/texts/920" + }, + { + "$ref": "#/texts/921" + }, + { + "$ref": "#/texts/922" + }, + { + "$ref": "#/texts/923" + }, + { + "$ref": "#/texts/924" + }, + { + "$ref": "#/texts/925" + }, + { + "$ref": "#/texts/926" + }, + { + "$ref": "#/texts/927" + }, + { + "$ref": "#/texts/928" + }, + { + "$ref": "#/texts/929" + }, + { + "$ref": "#/texts/930" + }, + { + "$ref": "#/texts/931" + }, + { + "$ref": "#/texts/932" + }, + { + "$ref": "#/texts/933" + }, + { + "$ref": "#/texts/934" + }, + { + "$ref": "#/texts/935" + }, + { + "$ref": "#/texts/936" + }, + { + "$ref": "#/texts/937" + }, + { + "$ref": "#/texts/938" + }, + { + "$ref": "#/texts/939" + }, + { + "$ref": "#/texts/940" + }, + { + "$ref": "#/texts/941" + }, + { + "$ref": "#/texts/942" + }, + { + "$ref": "#/texts/943" + }, + { + "$ref": "#/texts/944" + }, + { + "$ref": "#/texts/945" + }, + { + "$ref": "#/texts/946" + }, + { + "$ref": "#/texts/947" + }, + { + "$ref": "#/texts/948" + }, + { + "$ref": "#/texts/949" + }, + { + "$ref": "#/texts/950" + }, + { + "$ref": "#/texts/951" + }, + { + "$ref": "#/texts/952" + }, + { + "$ref": "#/texts/953" + }, + { + "$ref": "#/texts/954" + }, + { + "$ref": "#/texts/955" + }, + { + "$ref": "#/texts/956" + }, + { + "$ref": "#/texts/957" + }, + { + "$ref": "#/texts/958" + }, + { + "$ref": "#/texts/959" + }, + { + "$ref": "#/texts/960" + }, + { + "$ref": "#/texts/961" + }, + { + "$ref": "#/texts/962" + }, + { + "$ref": "#/texts/963" + }, + { + "$ref": "#/texts/964" + }, + { + "$ref": "#/texts/965" + }, + { + "$ref": "#/texts/966" + }, + { + "$ref": "#/texts/967" + }, + { + "$ref": "#/texts/968" + }, + { + "$ref": "#/texts/969" + }, + { + "$ref": "#/texts/970" + }, + { + "$ref": "#/texts/971" + }, + { + "$ref": "#/texts/972" + }, + { + "$ref": "#/texts/973" + }, + { + "$ref": "#/texts/974" + }, + { + "$ref": "#/texts/975" + }, + { + "$ref": "#/texts/976" + }, + { + "$ref": "#/texts/977" + }, + { + "$ref": "#/texts/978" + }, + { + "$ref": "#/texts/979" + }, + { + "$ref": "#/texts/980" + } + ], + "content_layer": "body", + "meta": {}, + "label": "picture", + "prov": [ + { + "page_no": 9, + "bbox": { + "l": 108.79005432128906, + "t": 467.1181335449219, + "r": 329.1195068359375, + "b": 308.97198486328125, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ] + } + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ], + "origin": { + "mimetype": "application/pdf", + "binary_hash": 14981478401387673002, + "filename": "2408.09869v3.pdf" + } + } + }, { "text": "we distributed the annotation workload and performed continuous be annotated. 
We refrained from class labels that are very specific", "meta": { From 7e4c29e90c473a3542d9a86abbcc868031f6aa74 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Wed, 29 Oct 2025 00:29:51 +0100 Subject: [PATCH 18/22] add allow- & block-lists for meta names, add std field name enum Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/common.py | 10 +- .../transforms/serializer/markdown.py | 13 ++- docling_core/types/doc/document.py | 36 +++++-- test/data/doc/group_with_metadata.yaml | 3 +- .../group_with_metadata_allowed_meta_names.md | 10 ++ .../group_with_metadata_blocked_meta_names.md | 10 ++ test/data/doc/group_with_metadata_default.md | 6 +- test/data/doc/group_with_metadata_marked.md | 6 +- .../group_with_metadata_without_non_meta.md | 15 +++ test/test_metadata.py | 95 ++++++++++++++----- 10 files changed, 162 insertions(+), 42 deletions(-) create mode 100644 test/data/doc/group_with_metadata_allowed_meta_names.md create mode 100644 test/data/doc/group_with_metadata_blocked_meta_names.md create mode 100644 test/data/doc/group_with_metadata_without_non_meta.md diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index c8288b9d..4720ada0 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -208,6 +208,14 @@ class CommonParams(BaseModel): use_legacy_annotations: bool = Field( default=False, description="Use legacy annotation serialization." ) + allowed_meta_names: Optional[set[str]] = Field( + default=None, + description="Meta name to allow; None means all meta names are allowed.", + ) + blocked_meta_names: set[str] = Field( + default_factory=set, + description="Meta name to block; takes precedence over allowed_meta_names.", + ) def merge_with_patch(self, patch: dict[str, Any]) -> Self: """Create an instance by merging the provided patch dict on top of self.""" @@ -587,7 +595,7 @@ def serialize_meta( return self.meta_serializer.serialize( item=item, doc=self.doc, - **kwargs, + **(self.params.model_dump() | kwargs), ) else: return create_ser_result( diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 5b9f3e5e..6292761d 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -109,7 +109,6 @@ class MarkdownParams(CommonParams): page_break_placeholder: Optional[str] = None # e.g. 
"" escape_underscores: bool = True escape_html: bool = True - # include_meta: bool = Field(default=True, description="Include item meta.") mark_meta: bool = Field(default=False, description="Mark meta sections.") include_annotations: bool = Field( default=True, @@ -284,8 +283,15 @@ def serialize( + list(item.meta.get_custom_part()) ) if ( - tmp := self._serialize_meta_field( - item.meta, key, params.mark_meta + ( + params.allowed_meta_names is None + or key in params.allowed_meta_names + ) + and (key not in params.blocked_meta_names) + and ( + tmp := self._serialize_meta_field( + item.meta, key, params.mark_meta + ) ) ) ] @@ -293,6 +299,7 @@ def serialize( else [] ), span_source=item if isinstance(item, DocItem) else [], + # NOTE for now using an empty span source for GroupItems ) def _serialize_meta_field( diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 2c4a7c80..117c80e0 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1017,6 +1017,17 @@ class SummaryMetaField(BasePrediction): text: str +# NOTE: should be manually kept in sync with top-level BaseMeta hierarchy fields +class MetaFieldName(str, Enum): + """Standard meta field names.""" + + SUMMARY = "summary" + DESCRIPTION = "description" + CLASSIFICATION = "classification" + MOLECULE = "molecule" + TABULAR_CHART = "tabular_chart" + + class BaseMeta(_ExtraAllowingModel): """Base class for metadata.""" @@ -1600,7 +1611,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: if isinstance(ann, PictureClassificationData): data["meta"].setdefault( - "classification", + MetaFieldName.CLASSIFICATION.value, PictureClassificationMetaField( predictions=[ PictureClassificationPrediction( @@ -1614,7 +1625,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, DescriptionAnnotation): data["meta"].setdefault( - "description", + MetaFieldName.DESCRIPTION.value, DescriptionMetaField( text=ann.text, created_by=ann.provenance, @@ -1622,7 +1633,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, PictureMoleculeData): data["meta"].setdefault( - "molecule", + MetaFieldName.MOLECULE.value, MoleculeMetaField( smi=ann.smi, confidence=ann.confidence, @@ -1639,7 +1650,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, PictureTabularChartData): data["meta"].setdefault( - "tabular_chart", + MetaFieldName.TABULAR_CHART.value, TabularChartMetaField( title=ann.title, chart_data=ann.chart_data, @@ -1834,7 +1845,7 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: if isinstance(ann, DescriptionAnnotation): data["meta"].setdefault( - "description", + MetaFieldName.DESCRIPTION.value, DescriptionMetaField( text=ann.text, created_by=ann.provenance, @@ -4753,8 +4764,10 @@ def export_to_markdown( # noqa: C901 include_annotations: bool = True, mark_annotations: bool = False, *, - mark_meta: bool = False, use_legacy_annotations: bool = False, + allowed_meta_names: Optional[set[str]] = None, + blocked_meta_names: Optional[set[str]] = None, + mark_meta: bool = False, ) -> str: r"""Serialize to Markdown. @@ -4808,6 +4821,10 @@ def export_to_markdown( # noqa: C901 :type mark_meta: bool = False :returns: The exported Markdown representation. :rtype: str + :param allowed_meta_names: Optional[set[str]]: Meta names to allow; None means all meta names are allowed. 
+ :type allowed_meta_names: Optional[set[str]] = None + :param blocked_meta_names: Optional[set[str]]: Meta names to block; takes precedence over allowed_meta_names. + :type blocked_meta_names: Optional[set[str]] = None """ from docling_core.transforms.serializer.markdown import ( MarkdownDocSerializer, @@ -4836,10 +4853,11 @@ def export_to_markdown( # noqa: C901 indent=indent, wrap_width=text_width if text_width > 0 else None, page_break_placeholder=page_break_placeholder, - # allowed_meta_names=set() if use_legacy_annotations else allowed_meta_names, - # blocked_meta_names=blocked_meta_names or set(), mark_meta=mark_meta, - include_annotations=include_annotations and use_legacy_annotations, + include_annotations=include_annotations, + use_legacy_annotations=use_legacy_annotations, + allowed_meta_names=allowed_meta_names, + blocked_meta_names=blocked_meta_names or set(), mark_annotations=mark_annotations, ), ) diff --git a/test/data/doc/group_with_metadata.yaml b/test/data/doc/group_with_metadata.yaml index b9353204..b88bcbc8 100644 --- a/test/data/doc/group_with_metadata.yaml +++ b/test/data/doc/group_with_metadata.yaml @@ -23,7 +23,6 @@ groups: content_layer: body label: chapter meta: - my_corp__test: value summary: text: This chapter discusses foo and bar. name: '1' @@ -36,6 +35,7 @@ groups: content_layer: body label: section meta: + my_corp__test_1: custom field value 1 summary: text: This section talks about foo. name: 1a @@ -59,6 +59,7 @@ groups: content_layer: body label: section meta: + my_corp__test_2: custom field value 2 summary: text: This section talks about bar. name: 1b diff --git a/test/data/doc/group_with_metadata_allowed_meta_names.md b/test/data/doc/group_with_metadata_allowed_meta_names.md new file mode 100644 index 00000000..af1bad3d --- /dev/null +++ b/test/data/doc/group_with_metadata_allowed_meta_names.md @@ -0,0 +1,10 @@ +This is some introductory text. + +[My Corp Test 1] custom field value 1 + +Regarding foo... + +1. lorem +2. ipsum + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_blocked_meta_names.md b/test/data/doc/group_with_metadata_blocked_meta_names.md new file mode 100644 index 00000000..2662ad01 --- /dev/null +++ b/test/data/doc/group_with_metadata_blocked_meta_names.md @@ -0,0 +1,10 @@ +This is some introductory text. + +Regarding foo... + +1. lorem +2. ipsum + +[My Corp Test 2] custom field value 2 + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_default.md b/test/data/doc/group_with_metadata_default.md index 157acfa4..f018e510 100644 --- a/test/data/doc/group_with_metadata_default.md +++ b/test/data/doc/group_with_metadata_default.md @@ -2,12 +2,12 @@ This document talks about various topics. This chapter discusses foo and bar. -value - This is some introductory text. This section talks about foo. +custom field value 1 + This paragraph provides more details about foo. Regarding foo... @@ -19,4 +19,6 @@ Here some foo specifics are listed. This section talks about bar. +custom field value 2 + Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_marked.md b/test/data/doc/group_with_metadata_marked.md index 5393569e..e73eaf0f 100644 --- a/test/data/doc/group_with_metadata_marked.md +++ b/test/data/doc/group_with_metadata_marked.md @@ -2,12 +2,12 @@ [Summary] This chapter discusses foo and bar. -[My Corp Test] value - This is some introductory text. [Summary] This section talks about foo. 
+[My Corp Test 1] custom field value 1 + [Summary] This paragraph provides more details about foo. Regarding foo... @@ -19,4 +19,6 @@ Regarding foo... [Summary] This section talks about bar. +[My Corp Test 2] custom field value 2 + Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_without_non_meta.md b/test/data/doc/group_with_metadata_without_non_meta.md new file mode 100644 index 00000000..f8bdd082 --- /dev/null +++ b/test/data/doc/group_with_metadata_without_non_meta.md @@ -0,0 +1,15 @@ +[Summary] This document talks about various topics. + +[Summary] This chapter discusses foo and bar. + +[Summary] This section talks about foo. + +[My Corp Test 1] custom field value 1 + +[Summary] This paragraph provides more details about foo. + +[Summary] Here some foo specifics are listed. + +[Summary] This section talks about bar. + +[My Corp Test 2] custom field value 2 \ No newline at end of file diff --git a/test/test_metadata.py b/test/test_metadata.py index 448ca327..aff10db7 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -16,9 +16,11 @@ BaseMeta, DocItem, DoclingDocument, + MetaFieldName, NodeItem, RefItem, SummaryMetaField, + create_meta_field_name, ) from docling_core.types.doc.labels import DocItemLabel, GroupLabel @@ -89,7 +91,9 @@ def _create_doc_with_group_with_metadata() -> DoclingDocument: grp1a.meta = BaseMeta( summary=SummaryMetaField(text="This section talks about foo.") ) - grp1.meta.set_custom_field(namespace="my_corp", name="test", value="value") + grp1a.meta.set_custom_field( + namespace="my_corp", name="test_1", value="custom field value 1" + ) txt1 = doc.add_text(text="Regarding foo...", label=DocItemLabel.TEXT, parent=grp1a) txt1.meta = BaseMeta( summary=SummaryMetaField(text="This paragraph provides more details about foo.") @@ -105,6 +109,9 @@ def _create_doc_with_group_with_metadata() -> DoclingDocument: grp1b.meta = BaseMeta( summary=SummaryMetaField(text="This section talks about bar.") ) + grp1b.meta.set_custom_field( + namespace="my_corp", name="test_2", value="custom field value 2" + ) doc.add_text(text="Regarding bar...", label=DocItemLabel.TEXT, parent=grp1b) return doc @@ -126,9 +133,7 @@ def test_md_ser_default(): doc = _create_doc_with_group_with_metadata() # test exporting to Markdown - params = MarkdownParams( - include_annotations=False, - ) + params = MarkdownParams() ser = MarkdownDocSerializer(doc=doc, params=params) ser_res = ser.serialize() actual = ser_res.text @@ -147,7 +152,6 @@ def test_md_ser_marked(): # test exporting to Markdown params = MarkdownParams( - include_annotations=False, mark_meta=True, ) ser = MarkdownDocSerializer(doc=doc, params=params) @@ -163,6 +167,68 @@ def test_md_ser_marked(): assert actual == expected +def test_md_ser_allowed_meta_names(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + allowed_meta_names={ + create_meta_field_name(namespace="my_corp", name="test_1"), + }, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_allowed_meta_names.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_blocked_meta_names(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + blocked_meta_names={ + 
create_meta_field_name(namespace="my_corp", name="test_1"), + MetaFieldName.SUMMARY.value, + }, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_blocked_meta_names.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_without_non_meta(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + include_non_meta=False, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_without_non_meta.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + def test_ser_custom_meta_serializer(): class SummaryMarkdownMetaSerializer(MarkdownMetaSerializer): @@ -213,29 +279,10 @@ def _serialize_meta_field( else: return None - # class SummaryMarkdownDocSerializer(MarkdownDocSerializer): - # # just for overriding the delimiter to single newline: - # @override - # def serialize_doc( - # self, - # *, - # parts: list[SerializationResult], - # **kwargs: Any, - # ) -> SerializationResult: - # """Serialize a document out of its parts.""" - # text_res = "\n".join([p.text for p in parts if p.text]) - # if self.requires_page_break(): - # page_sep = self.params.page_break_placeholder or "" - # for full_match, _, _ in self._get_page_breaks(text=text_res): - # text_res = text_res.replace(full_match, page_sep) - - # return create_ser_result(text=text_res, span_source=parts) - doc = _create_doc_with_group_with_metadata() # test exporting to Markdown params = MarkdownParams( - include_annotations=False, include_non_meta=False, ) ser = MarkdownDocSerializer( From d8b7cc5ad4e629c1df72c89d12a16fef7c8d8cf8 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Wed, 29 Oct 2025 16:25:26 +0100 Subject: [PATCH 19/22] add HTML serializer, document meta field names, rename SMILES field Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/html.py | 81 ++++++++++++++++++- .../transforms/serializer/markdown.py | 2 +- docling_core/types/doc/document.py | 14 ++-- docs/DoclingDocument.json | 6 +- ...iched_p1_include_annotations_false.gt.html | 1 + ...riched_p1_include_annotations_true.gt.html | 2 +- .../2408.09869v3_enriched_p2_p3_p5.gt.html | 4 +- .../doc/2408.09869v3_enriched_split.gt.html | 46 +++++++---- test/data/doc/barchart.gt.html | 3 +- test/data/doc/dummy_doc.yaml.html | 9 ++- test/data/doc/dummy_doc_2_prec.yaml | 2 +- .../doc/dummy_doc_with_meta_modified.yaml | 2 +- 12 files changed, 135 insertions(+), 37 deletions(-) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 98e5cf7d..aad194c1 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -17,7 +17,7 @@ import latex2mathml.converter from PIL.Image import Image -from pydantic import AnyUrl, BaseModel +from pydantic import AnyUrl, BaseModel, Field from typing_extensions import override from docling_core.transforms.serializer.base import ( @@ -28,6 +28,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, 
BaseTableSerializer, BaseTextSerializer, @@ -46,9 +47,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + BaseMeta, CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DoclingDocument, FloatingItem, @@ -61,14 +64,18 @@ KeyValueItem, ListGroup, ListItem, + MoleculeMetaField, NodeItem, PictureClassificationData, + PictureClassificationMetaField, PictureItem, PictureMoleculeData, PictureTabularChartData, RichTableCell, SectionHeaderItem, + SummaryMetaField, TableItem, + TabularChartMetaField, TextItem, TitleItem, ) @@ -115,7 +122,11 @@ class HTMLParams(CommonParams): # Enable charts to be printed into HTML as tables enable_chart_tables: bool = True - include_annotations: bool = True + include_annotations: bool = Field( + default=True, + description="Include item annotations.", + deprecated="Use include_meta instead.", + ) show_original_list_item_marker: bool = True @@ -808,6 +819,65 @@ def serialize( ) +class HTMLMetaSerializer(BaseModel, BaseMetaSerializer): + """HTML-specific meta serializer.""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = HTMLParams(**kwargs) + return create_ser_result( + text="\n".join( + [ + tmp + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) + if ( + ( + params.allowed_meta_names is None + or key in params.allowed_meta_names + ) + and (key not in params.blocked_meta_names) + and (tmp := self._serialize_meta_field(item.meta, key)) + ) + ] + if item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + # NOTE for now using an empty span source for GroupItems + ) + + def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None: + if isinstance(field_val, SummaryMetaField): + txt = field_val.text + elif isinstance(field_val, DescriptionMetaField): + txt = field_val.text + elif isinstance(field_val, PictureClassificationMetaField): + txt = self._humanize_text(field_val.get_main_prediction().class_name) + elif isinstance(field_val, MoleculeMetaField): + txt = field_val.smiles + elif isinstance(field_val, TabularChartMetaField): + # suppressing tabular chart serialization + return None + elif tmp := str(field_val or ""): + txt = tmp + else: + return None + return f"
{txt}
" + else: + return None + + class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer): """HTML-specific annotation serializer.""" @@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer): list_serializer: BaseListSerializer = HTMLListSerializer() inline_serializer: BaseInlineSerializer = HTMLInlineSerializer() + meta_serializer: BaseMetaSerializer = HTMLMetaSerializer() annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer() params: HTMLParams = HTMLParams() @@ -1047,7 +1118,11 @@ def serialize_captions( ) results.append(cap_ser_res) - if params.include_annotations and item.self_ref not in excluded_refs: + if ( + params.use_legacy_annotations + and params.include_annotations + and item.self_ref not in excluded_refs + ): if isinstance(item, (PictureItem, TableItem)): ann_res = self.serialize_annotations( item=item, diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 6292761d..f82ff9ff 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -313,7 +313,7 @@ def _serialize_meta_field( elif isinstance(field_val, PictureClassificationMetaField): txt = self._humanize_text(field_val.get_main_prediction().class_name) elif isinstance(field_val, MoleculeMetaField): - txt = field_val.smi + txt = field_val.smiles elif isinstance(field_val, TabularChartMetaField): # suppressing tabular chart serialization return None diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 117c80e0..3d4e5537 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1021,11 +1021,11 @@ class SummaryMetaField(BasePrediction): class MetaFieldName(str, Enum): """Standard meta field names.""" - SUMMARY = "summary" - DESCRIPTION = "description" - CLASSIFICATION = "classification" - MOLECULE = "molecule" - TABULAR_CHART = "tabular_chart" + SUMMARY = "summary" # a summary of the tree under this node + DESCRIPTION = "description" # a description of the node (e.g. 
for images) + CLASSIFICATION = "classification" # a classification of the node content + MOLECULE = "molecule" # molecule data + TABULAR_CHART = "tabular_chart" # tabular chart data class BaseMeta(_ExtraAllowingModel): @@ -1069,7 +1069,7 @@ def get_main_prediction(self) -> PictureClassificationPrediction: class MoleculeMetaField(BasePrediction): """Molecule metadata field.""" - smi: str = Field(description="The SMILES representation of the molecule.") + smiles: str = Field(description="The SMILES representation of the molecule.") class TabularChartMetaField(BasePrediction): @@ -1635,7 +1635,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: data["meta"].setdefault( MetaFieldName.MOLECULE.value, MoleculeMetaField( - smi=ann.smi, + smiles=ann.smi, confidence=ann.confidence, created_by=ann.provenance, **{ diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index f1c957cf..060c0df3 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1421,14 +1421,14 @@ ], "title": "Created By" }, - "smi": { + "smiles": { "description": "The SMILES representation of the molecule.", - "title": "Smi", + "title": "Smiles", "type": "string" } }, "required": [ - "smi" + "smiles" ], "title": "MoleculeMetaField", "type": "object" diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html index 3e166869..7f10d0ac 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html @@ -4,6 +4,7 @@

Docling Technical Report

+
In this image we can see a cartoon image of a duck holding a paper.

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html index 0bb79d05..7f10d0ac 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html @@ -4,7 +4,7 @@

Docling Technical Report

-
In this image we can see a cartoon image of a duck holding a paper.
+
In this image we can see a cartoon image of a duck holding a paper.

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html index f728cdb3..00bf0385 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html @@ -126,7 +126,8 @@

3.1 PDF backends

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
+
In this image, we can see some text and images.
+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

3.2 AI models

@@ -148,6 +149,7 @@

Table Structure Recognition

torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.

+
{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'}
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
CPUThread budgetnative backendpypdfium backend
TTSPages/sMemTTSPages/sMem
Apple M3 Max4177 s 167 s1.27 1.346.20 GB103 s 92 s2.18 2.452.56 GB
(16 cores) Intel(R) Xeon E5-269016 4 16375 s 244 s0.60 0.926.16 GB239 s 143 s0.94 1.572.42 GB

5 Applications

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.
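The {'summary': ..., 'type': 'performance data'} line rendered above the table caption appears to be the string form of extra meta fields attached to the table item. A rough sketch of how such metadata gets attached, mirroring the helpers used in test_metadata.py (the texts, namespace, and values below are made up):

    from docling_core.types.doc.document import (
        BaseMeta,
        DoclingDocument,
        SummaryMetaField,
    )
    from docling_core.types.doc.labels import DocItemLabel

    doc = DoclingDocument(name="sample")
    txt = doc.add_text(text="Some paragraph.", label=DocItemLabel.TEXT)

    # standard field from the new meta hierarchy
    txt.meta = BaseMeta(summary=SummaryMetaField(text="A one-line summary."))
    # namespaced custom field, serialized much like the dict shown above
    txt.meta.set_custom_field(namespace="my_corp", name="type", value="performance data")
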

diff --git a/test/data/doc/2408.09869v3_enriched_split.gt.html b/test/data/doc/2408.09869v3_enriched_split.gt.html index 33c39bc5..1adaa3d9 100644 --- a/test/data/doc/2408.09869v3_enriched_split.gt.html +++ b/test/data/doc/2408.09869v3_enriched_split.gt.html @@ -96,7 +96,8 @@

Docling Technical Report

-
In this image we can see a cartoon image of a duck holding a paper.
+
In this image we can see a cartoon image of a duck holding a paper.
+

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

@@ -147,7 +148,8 @@

3.1 PDF backends

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
+
In this image, we can see some text and images.
+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

3.2 AI models

@@ -193,6 +195,7 @@

4 Performance

torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.

+
{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'}
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
CPUThread budgetnative backendpypdfium backend
TTSPages/sMemTTSPages/sMem
Apple M3 Max4177 s 167 s1.27 1.346.20 GB103 s 92 s2.18 2.452.56 GB
(16 cores) Intel(R) Xeon E5-269016 4 16375 s 244 s0.60 0.926.16 GB239 s 143 s0.94 1.572.42 GB

5 Applications

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

@@ -273,12 +276,16 @@

KEYWORDS

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

ACM Reference Format:

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

-
In this image there is a table with some text on it.
-
In this image we can see a text.
+
In this image there is a table with some text on it.
+
+
In this image we can see a text.
+

AGL Energy Limited ABN 74 1

5 061 375

-
In this image I can see the cover of the book.
-
In this image there is a paper with some text on it.
+
In this image I can see the cover of the book.
+
+
In this image there is a paper with some text on it.
+

Figure 1: Four examples of complex page layouts across different document categories

KEYWORDS

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

@@ -303,11 +310,12 @@

ACMReference Format:

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.

5 EXPERIMENTS

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

-
In this image, we can see a table with some text.
+
In this image, we can see a table with some text.
+

Third, achienec

EXPERIMENTS

chalenongayouls ground-vuth dawa such WC

-
The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. +
The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -315,7 +323,8 @@

EXPERIMENTS

#### Training Program: - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%.
+- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%.
+

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].

@@ -341,18 +350,22 @@

Baselines for Object Detection

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %

between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric

-
The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. +
The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A"
-
In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318.
+The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A"
+
+
In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318.
+
class labelCount% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)
TrainTestValAllFinManSciLawPatTen
Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98
Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
-
In this image I can see a blue circle.
+
In this image I can see a blue circle.
+

include publication repositories such as arXiv

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-

annotated pages, from which we obtain accuracy ranges.

-
A table with different columns and rows.
+
A table with different columns and rows.
+
% of Total% of Total% of Totaltriple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)
class labelCountTrainTestValAllFinManSciLawPatTen
Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
Page- footer708786.515.586.0093-9488-9095-9610092-9710096-98
Page- header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
Section- header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85

3

,

@@ -363,7 +376,8 @@

Baselines for Object Detection

Title and

.

page. Specificity ensures that the choice of label is not ambiguous,

-
In this image there is a text in the middle.
+
In this image there is a text in the middle.
+

we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific

only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can

quality controls. Phase one and two required a small team of experts to a document category, such as
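The Markdown-side counterparts of these HTML changes are the allow- and block-lists introduced in the previous commit. Assuming doc is a DoclingDocument whose items carry meta (as set up in test_metadata.py), the document-level call could look roughly like this:

    from docling_core.types.doc.document import MetaFieldName, create_meta_field_name

    md = doc.export_to_markdown(
        mark_meta=True,
        # only these meta names are serialized; None would mean "allow all"
        allowed_meta_names={
            MetaFieldName.SUMMARY.value,
            create_meta_field_name(namespace="my_corp", name="test_1"),
        },
        # blocked names take precedence over allowed ones
        blocked_meta_names=set(),
    )
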

diff --git a/test/data/doc/barchart.gt.html b/test/data/doc/barchart.gt.html index 6ee917ef..05cbb81f 100644 --- a/test/data/doc/barchart.gt.html +++ b/test/data/doc/barchart.gt.html @@ -124,7 +124,8 @@
-
bar chart
Number of impellerssingle-frequencymulti-frequency
10.060.16
20.120.26
30.160.27
40.140.26
50.160.25
60.240.24
+
Bar chart
+
Number of impellerssingle-frequencymulti-frequency
10.060.16
20.120.26
30.160.27
40.140.26
50.160.25
60.240.24
diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html index 0e9e69d5..1273bff5 100644 --- a/test/data/doc/dummy_doc.yaml.html +++ b/test/data/doc/dummy_doc.yaml.html @@ -125,8 +125,13 @@

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

-
Figure 1: Four examples of complex page layouts across different document categories
bar chart
...
CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
-
A description annotation for this table.
+
...
+
Bar chart
+
CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
+
{'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}
+
Figure 1: Four examples of complex page layouts across different document categories
+
A description annotation for this table.
+
{'foo': 'bar'}
diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 31fa5fdd..814512f9 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -107,7 +107,7 @@ pictures: - 1.0 - - 1.0 - 1.0 - smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index a58a566b..bc5e5845 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -107,7 +107,7 @@ pictures: - 1.0 - - 1.0 - 1.0 - smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov: From dafb5844a088d94ea99bcc9215ca1fdb79821f20 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Wed, 29 Oct 2025 16:32:23 +0100 Subject: [PATCH 20/22] bump DoclingDocument version Signed-off-by: Panos Vagenas --- docling_core/types/doc/document.py | 2 +- docs/DoclingDocument.json | 2 +- test/data/doc/2206.01062.yaml.dt.json | 2 +- test/data/doc/2408.09869v3_enriched.out.dt.json | 2 +- test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json | 2 +- test/data/doc/concatenated.json | 2 +- test/data/doc/constructed_doc.added_extracted_doc.json.gt | 2 +- test/data/doc/constructed_doc.appended_child.json.gt | 2 +- test/data/doc/constructed_doc.bulk_item_addition.json.gt | 2 +- test/data/doc/constructed_doc.bulk_item_insertion.json.gt | 2 +- test/data/doc/constructed_doc.deleted_group.json.gt | 2 +- test/data/doc/constructed_doc.deleted_items_range.json.gt | 2 +- test/data/doc/constructed_doc.deleted_picture.json.gt | 2 +- test/data/doc/constructed_doc.deleted_text.json.gt | 2 +- test/data/doc/constructed_doc.embedded.json.gt | 2 +- test/data/doc/constructed_doc.embedded.yaml.gt | 2 +- test/data/doc/constructed_doc.extracted_with_deletion.json.gt | 2 +- test/data/doc/constructed_doc.inserted_extracted_doc.json.gt | 2 +- .../doc/constructed_doc.inserted_items_with_insert_*.json.gt | 2 +- .../constructed_doc.inserted_list_items_with_insert_*.json.gt | 2 +- test/data/doc/constructed_doc.inserted_text.json.gt | 2 +- test/data/doc/constructed_doc.manipulated_table.json.gt | 2 +- test/data/doc/constructed_doc.referenced.json.gt | 2 +- test/data/doc/constructed_doc.referenced.yaml.gt | 2 +- test/data/doc/constructed_doc.replaced_item.json.gt | 2 +- test/data/doc/doc_with_kv.dt.json | 2 +- test/data/doc/dummy_doc_2_prec.yaml | 2 +- test/data/doc/dummy_doc_with_meta_modified.yaml | 2 +- test/data/doc/group_with_metadata.yaml | 2 +- test/data/doc/misplaced_list_items.norm.out.yaml | 2 +- test/data/doc/misplaced_list_items.out.yaml | 2 +- test/data/doc/page_with_pic.dt.json | 2 +- test/data/doc/page_with_pic_from_files.dt.json | 2 +- test/data/doc/page_without_pic.dt.json | 2 +- test/data/doc/rich_table.out.yaml | 2 +- test/data/doc/rich_table_item_ins_norm_1.out.yaml | 2 +- test/data/doc/rich_table_item_ins_norm_2.out.yaml | 2 +- test/data/doc/rich_table_item_ins_norm_3.out.yaml | 2 +- test/data/doc/rich_table_post_text_del.out.yaml | 2 +- test/data/legacy_doc/doc-export.docling.yaml.gt | 2 +- 40 files changed, 40 insertions(+), 40 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 3d4e5537..861e48a4 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -62,7 +62,7 @@ Uint64 = 
typing.Annotated[int, Field(ge=0, le=(2**64 - 1))] LevelNumber = typing.Annotated[int, Field(ge=1, le=100)] -CURRENT_VERSION: Final = "1.7.0" +CURRENT_VERSION: Final = "1.8.0" DEFAULT_EXPORT_LABELS = { DocItemLabel.TITLE, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 060c0df3..9c8d5939 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -2883,7 +2883,7 @@ "type": "string" }, "version": { - "default": "1.7.0", + "default": "1.8.0", "pattern": "^(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)(?:-(?P(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$", "title": "Version", "type": "string" diff --git a/test/data/doc/2206.01062.yaml.dt.json b/test/data/doc/2206.01062.yaml.dt.json index f954386b..83dd30ce 100644 --- a/test/data/doc/2206.01062.yaml.dt.json +++ b/test/data/doc/2206.01062.yaml.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index 7d3e159a..fec32692 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 59d525d3..7a501c17 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "2408.09869v3", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/concatenated.json b/test/data/doc/concatenated.json index 47fe4990..e48e9d61 100644 --- a/test/data/doc/concatenated.json +++ b/test/data/doc/concatenated.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "2501.17887v1 + Untitled 1 + 2311.18481v1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.added_extracted_doc.json.gt b/test/data/doc/constructed_doc.added_extracted_doc.json.gt index 4013747b..ed878b6a 100644 --- a/test/data/doc/constructed_doc.added_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.added_extracted_doc.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.appended_child.json.gt b/test/data/doc/constructed_doc.appended_child.json.gt index 74b6fba7..d3e30ed0 100644 --- a/test/data/doc/constructed_doc.appended_child.json.gt +++ b/test/data/doc/constructed_doc.appended_child.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.bulk_item_addition.json.gt b/test/data/doc/constructed_doc.bulk_item_addition.json.gt index 257c5b90..a4379c8f 100644 --- a/test/data/doc/constructed_doc.bulk_item_addition.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_addition.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - 
"version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt index ce4f7c6d..398c5c62 100644 --- a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_group.json.gt b/test/data/doc/constructed_doc.deleted_group.json.gt index 549ae6a0..5cea9068 100644 --- a/test/data/doc/constructed_doc.deleted_group.json.gt +++ b/test/data/doc/constructed_doc.deleted_group.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_items_range.json.gt b/test/data/doc/constructed_doc.deleted_items_range.json.gt index 91b37357..12ed02c5 100644 --- a/test/data/doc/constructed_doc.deleted_items_range.json.gt +++ b/test/data/doc/constructed_doc.deleted_items_range.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_picture.json.gt b/test/data/doc/constructed_doc.deleted_picture.json.gt index 85890f23..bfcdd153 100644 --- a/test/data/doc/constructed_doc.deleted_picture.json.gt +++ b/test/data/doc/constructed_doc.deleted_picture.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_text.json.gt b/test/data/doc/constructed_doc.deleted_text.json.gt index 45c03c2a..62d866c5 100644 --- a/test/data/doc/constructed_doc.deleted_text.json.gt +++ b/test/data/doc/constructed_doc.deleted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.embedded.json.gt b/test/data/doc/constructed_doc.embedded.json.gt index 4ac0e019..8fff1a5b 100644 --- a/test/data/doc/constructed_doc.embedded.json.gt +++ b/test/data/doc/constructed_doc.embedded.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.embedded.yaml.gt b/test/data/doc/constructed_doc.embedded.yaml.gt index 15d93ce3..d2d171d2 100644 --- a/test/data/doc/constructed_doc.embedded.yaml.gt +++ b/test/data/doc/constructed_doc.embedded.yaml.gt @@ -1113,4 +1113,4 @@ texts: prov: [] self_ref: '#/texts/55' text: The end. 
-version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt index fc7a3b94..1712938c 100644 --- a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt +++ b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt index a31af507..68b1dceb 100644 --- a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt index 2722426c..7144b362 100644 --- a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt index 42044db6..ded60ef6 100644 --- a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_text.json.gt b/test/data/doc/constructed_doc.inserted_text.json.gt index 6c4285f4..abff4454 100644 --- a/test/data/doc/constructed_doc.inserted_text.json.gt +++ b/test/data/doc/constructed_doc.inserted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.manipulated_table.json.gt b/test/data/doc/constructed_doc.manipulated_table.json.gt index e65dd7d8..66223131 100644 --- a/test/data/doc/constructed_doc.manipulated_table.json.gt +++ b/test/data/doc/constructed_doc.manipulated_table.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.referenced.json.gt b/test/data/doc/constructed_doc.referenced.json.gt index 8a11418f..da939109 100644 --- a/test/data/doc/constructed_doc.referenced.json.gt +++ b/test/data/doc/constructed_doc.referenced.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.referenced.yaml.gt b/test/data/doc/constructed_doc.referenced.yaml.gt index bb291c11..2a2355a2 100644 --- a/test/data/doc/constructed_doc.referenced.yaml.gt +++ b/test/data/doc/constructed_doc.referenced.yaml.gt @@ -1113,4 +1113,4 @@ texts: prov: [] self_ref: 
'#/texts/55' text: The end. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/constructed_doc.replaced_item.json.gt b/test/data/doc/constructed_doc.replaced_item.json.gt index 91b37357..12ed02c5 100644 --- a/test/data/doc/constructed_doc.replaced_item.json.gt +++ b/test/data/doc/constructed_doc.replaced_item.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/doc_with_kv.dt.json b/test/data/doc/doc_with_kv.dt.json index d59bf29a..c56dbd34 100644 --- a/test/data/doc/doc_with_kv.dt.json +++ b/test/data/doc/doc_with_kv.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 814512f9..905f137f 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -271,4 +271,4 @@ texts: self_ref: '#/texts/3' text: 'Figure 1: Four examples of complex page layouts across different document categories' -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index bc5e5845..8cc32a81 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -283,4 +283,4 @@ texts: self_ref: '#/texts/3' text: 'Figure 1: Four examples of complex page layouts across different document categories' -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/group_with_metadata.yaml b/test/data/doc/group_with_metadata.yaml index b88bcbc8..7cfd08d5 100644 --- a/test/data/doc/group_with_metadata.yaml +++ b/test/data/doc/group_with_metadata.yaml @@ -125,4 +125,4 @@ texts: prov: [] self_ref: '#/texts/4' text: Regarding bar... 
-version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/misplaced_list_items.norm.out.yaml b/test/data/doc/misplaced_list_items.norm.out.yaml index 1b33dd76..9fd7edff 100644 --- a/test/data/doc/misplaced_list_items.norm.out.yaml +++ b/test/data/doc/misplaced_list_items.norm.out.yaml @@ -81,4 +81,4 @@ texts: prov: [] self_ref: '#/texts/3' text: there -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/misplaced_list_items.out.yaml b/test/data/doc/misplaced_list_items.out.yaml index c334227e..18b5f978 100644 --- a/test/data/doc/misplaced_list_items.out.yaml +++ b/test/data/doc/misplaced_list_items.out.yaml @@ -81,4 +81,4 @@ texts: prov: [] self_ref: '#/texts/3' text: foo -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/page_with_pic.dt.json b/test/data/doc/page_with_pic.dt.json index b11d817e..966d2e02 100644 --- a/test/data/doc/page_with_pic.dt.json +++ b/test/data/doc/page_with_pic.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/page_with_pic_from_files.dt.json b/test/data/doc/page_with_pic_from_files.dt.json index b11d817e..966d2e02 100644 --- a/test/data/doc/page_with_pic_from_files.dt.json +++ b/test/data/doc/page_with_pic_from_files.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/page_without_pic.dt.json b/test/data/doc/page_without_pic.dt.json index 10cd83b9..11a6aede 100644 --- a/test/data/doc/page_without_pic.dt.json +++ b/test/data/doc/page_without_pic.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/rich_table.out.yaml b/test/data/doc/rich_table.out.yaml index c5f8eecc..76d76a38 100644 --- a/test/data/doc/rich_table.out.yaml +++ b/test/data/doc/rich_table.out.yaml @@ -499,4 +499,4 @@ texts: prov: [] self_ref: '#/texts/5' text: More text in the group. 
-version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_1.out.yaml b/test/data/doc/rich_table_item_ins_norm_1.out.yaml index fecd739d..bfd0788d 100644 --- a/test/data/doc/rich_table_item_ins_norm_1.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_1.out.yaml @@ -240,4 +240,4 @@ texts: prov: [] self_ref: '#/texts/1' text: text in italic -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_2.out.yaml b/test/data/doc/rich_table_item_ins_norm_2.out.yaml index f2e05e5b..8cdfc00d 100644 --- a/test/data/doc/rich_table_item_ins_norm_2.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_2.out.yaml @@ -250,4 +250,4 @@ texts: prov: [] self_ref: '#/texts/2' text: text before -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_3.out.yaml b/test/data/doc/rich_table_item_ins_norm_3.out.yaml index b35564ff..71f3e2a0 100644 --- a/test/data/doc/rich_table_item_ins_norm_3.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_3.out.yaml @@ -250,4 +250,4 @@ texts: prov: [] self_ref: '#/texts/2' text: text in italic -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_post_text_del.out.yaml b/test/data/doc/rich_table_post_text_del.out.yaml index 67e086d5..42d71415 100644 --- a/test/data/doc/rich_table_post_text_del.out.yaml +++ b/test/data/doc/rich_table_post_text_del.out.yaml @@ -489,4 +489,4 @@ texts: prov: [] self_ref: '#/texts/4' text: More text in the group. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/legacy_doc/doc-export.docling.yaml.gt b/test/data/legacy_doc/doc-export.docling.yaml.gt index 4fc4a7fa..3dfac982 100644 --- a/test/data/legacy_doc/doc-export.docling.yaml.gt +++ b/test/data/legacy_doc/doc-export.docling.yaml.gt @@ -6822,4 +6822,4 @@ texts: text: '23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. 
IEEE (2019)' -version: 1.7.0 +version: 1.8.0 From 5aa768f2fe4cca5e5d6472acf1f234cb1e88db76 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Thu, 30 Oct 2025 08:42:50 +0100 Subject: [PATCH 21/22] make TabularChartMetaField.title optional, expose new classes through __init__.py, add MetaUtils Signed-off-by: Panos Vagenas --- docling_core/types/doc/__init__.py | 11 ++++++ docling_core/types/doc/document.py | 56 +++++++++++++++++------------- docs/DoclingDocument.json | 13 +++++-- test/test_metadata.py | 11 +++--- 4 files changed, 59 insertions(+), 32 deletions(-) diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index 25b5a869..cf0edfef 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -9,6 +9,8 @@ from .document import ( AnyTableCell, BaseAnnotation, + BaseMeta, + BasePrediction, ChartBar, ChartLine, ChartPoint, @@ -17,12 +19,14 @@ CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DoclingDocument, DocTagsDocument, DocTagsPage, DocumentOrigin, FloatingItem, + FloatingMeta, Formatting, FormItem, FormulaItem, @@ -35,7 +39,10 @@ KeyValueItem, ListGroup, ListItem, + MetaFieldName, + MetaUtils, MiscAnnotation, + MoleculeMetaField, NodeItem, OrderedList, PageItem, @@ -43,9 +50,11 @@ PictureChartData, PictureClassificationClass, PictureClassificationData, + PictureClassificationMetaField, PictureDataType, PictureItem, PictureLineChartData, + PictureMeta, PictureMoleculeData, PicturePieChartData, PictureScatterChartData, @@ -56,9 +65,11 @@ RichTableCell, Script, SectionHeaderItem, + SummaryMetaField, TableCell, TableData, TableItem, + TabularChartMetaField, TextItem, TitleItem, UnorderedList, diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 861e48a4..676e1b91 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -959,7 +959,7 @@ def _copy_without_extra(self) -> Self: ) def _check_custom_field_format(self, key: str) -> None: - parts = key.split(_META_FIELD_NAMESPACE_DELIMITER, maxsplit=1) + parts = key.split(MetaUtils._META_FIELD_NAMESPACE_DELIMITER, maxsplit=1) if len(parts) != 2 or (not parts[0]) or (not parts[1]): raise ValueError( f"Custom meta field name must be in format 'namespace__field_name' (e.g. 
'my_corp__max_size'): {key}" @@ -971,7 +971,7 @@ def _validate_field_names(self) -> Self: for key in self.model_dump(): if key in extra_dict: self._check_custom_field_format(key=key) - elif _META_FIELD_NAMESPACE_DELIMITER in key: + elif MetaUtils._META_FIELD_NAMESPACE_DELIMITER in key: raise ValueError( f"Standard meta field name must not contain '__': {key}" ) @@ -985,7 +985,7 @@ def __setattr__(self, name: str, value: Any) -> None: def set_custom_field(self, namespace: str, name: str, value: Any) -> str: """Set a custom field and return the key.""" - key = create_meta_field_name(namespace=namespace, name=name) + key = MetaUtils.create_meta_field_name(namespace=namespace, name=name) setattr(self, key, value) return key @@ -1075,7 +1075,7 @@ class MoleculeMetaField(BasePrediction): class TabularChartMetaField(BasePrediction): """Tabular chart metadata field.""" - title: str + title: Optional[str] = None chart_data: TableData @@ -1555,23 +1555,31 @@ class FormulaItem(TextItem): ) -_META_FIELD_NAMESPACE_DELIMITER = "__" - +class MetaUtils: + """Metadata-related utilities.""" -def create_meta_field_name( - *, - namespace: str, - name: str, -) -> str: - """Create a meta field name.""" - return f"{namespace}{_META_FIELD_NAMESPACE_DELIMITER}{name}" + _META_FIELD_NAMESPACE_DELIMITER: Final = "__" + _META_FIELD_LEGACY_NAMESPACE: Final = "docling_legacy" + @classmethod + def create_meta_field_name( + cls, + *, + namespace: str, + name: str, + ) -> str: + """Create a meta field name.""" + return f"{namespace}{cls._META_FIELD_NAMESPACE_DELIMITER}{name}" -def _create_migrated_meta_field_name( - *, - name: str, -) -> str: - return create_meta_field_name(namespace="docling_legacy", name=name) + @classmethod + def _create_migrated_meta_field_name( + cls, + *, + name: str, + ) -> str: + return cls.create_meta_field_name( + namespace=cls._META_FIELD_LEGACY_NAMESPACE, name=name + ) class PictureItem(FloatingItem): @@ -1639,10 +1647,10 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: confidence=ann.confidence, created_by=ann.provenance, **{ - _create_migrated_meta_field_name( + MetaUtils._create_migrated_meta_field_name( name="segmentation" ): ann.segmentation, - _create_migrated_meta_field_name( + MetaUtils._create_migrated_meta_field_name( name="class_name" ): ann.class_name, }, @@ -1658,13 +1666,13 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, MiscAnnotation): data["meta"].setdefault( - _create_migrated_meta_field_name(name=ann.kind), + MetaUtils._create_migrated_meta_field_name(name=ann.kind), ann.content, ) else: # fall back to reusing original annotation type name (in namespaced format) data["meta"].setdefault( - _create_migrated_meta_field_name(name=ann.kind), + MetaUtils._create_migrated_meta_field_name(name=ann.kind), ann.model_dump(mode="json"), ) @@ -1853,13 +1861,13 @@ def migrate_annotations_to_meta(cls, data: Any) -> Any: ) elif isinstance(ann, MiscAnnotation): data["meta"].setdefault( - _create_migrated_meta_field_name(name=ann.kind), + MetaUtils._create_migrated_meta_field_name(name=ann.kind), ann.content, ) else: # fall back to reusing original annotation type name (in namespaced format) data["meta"].setdefault( - _create_migrated_meta_field_name(name=ann.kind), + MetaUtils._create_migrated_meta_field_name(name=ann.kind), ann.model_dump(mode="json"), ) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 9c8d5939..a1dd28ca 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -2642,15 +2642,22 @@ "title": 
"Created By" }, "title": { - "title": "Title", - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Title" }, "chart_data": { "$ref": "#/$defs/TableData" } }, "required": [ - "title", "chart_data" ], "title": "TabularChartMetaField", diff --git a/test/test_metadata.py b/test/test_metadata.py index aff10db7..2eec3e8d 100644 --- a/test/test_metadata.py +++ b/test/test_metadata.py @@ -12,17 +12,18 @@ MarkdownMetaSerializer, MarkdownParams, ) -from docling_core.types.doc.document import ( +from docling_core.types.doc import ( BaseMeta, DocItem, + DocItemLabel, DoclingDocument, + GroupLabel, MetaFieldName, + MetaUtils, NodeItem, RefItem, SummaryMetaField, - create_meta_field_name, ) -from docling_core.types.doc.labels import DocItemLabel, GroupLabel from .test_data_gen_flag import GEN_TEST_DATA @@ -171,7 +172,7 @@ def test_md_ser_allowed_meta_names(): doc = _create_doc_with_group_with_metadata() params = MarkdownParams( allowed_meta_names={ - create_meta_field_name(namespace="my_corp", name="test_1"), + MetaUtils.create_meta_field_name(namespace="my_corp", name="test_1"), }, mark_meta=True, ) @@ -192,7 +193,7 @@ def test_md_ser_blocked_meta_names(): doc = _create_doc_with_group_with_metadata() params = MarkdownParams( blocked_meta_names={ - create_meta_field_name(namespace="my_corp", name="test_1"), + MetaUtils.create_meta_field_name(namespace="my_corp", name="test_1"), MetaFieldName.SUMMARY.value, }, mark_meta=True, From 37982d9992f219f5d40acabb4a29562d875ad808 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Thu, 30 Oct 2025 09:23:02 +0100 Subject: [PATCH 22/22] add DocTags serialization, revert smiles to smi to prevent confusion with plural Signed-off-by: Panos Vagenas --- docling_core/transforms/serializer/doctags.py | 55 ++++++++++++------- docling_core/transforms/serializer/html.py | 2 +- .../transforms/serializer/markdown.py | 2 +- docling_core/types/doc/document.py | 6 +- docs/DoclingDocument.json | 6 +- test/data/doc/dummy_doc_2_prec.yaml | 2 +- .../doc/dummy_doc_with_meta_modified.yaml | 2 +- 7 files changed, 45 insertions(+), 30 deletions(-) diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index 844d0096..0195cd8e 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -44,6 +44,7 @@ PictureTabularChartData, ProvenanceItem, SectionHeaderItem, + TableData, TableItem, TextItem, ) @@ -233,13 +234,22 @@ def serialize( ysize=params.ysize, ) - classifications = [ - ann - for ann in item.annotations - if isinstance(ann, PictureClassificationData) - ] - if len(classifications) > 0: + # handle classification data + predicted_class: Optional[str] = None + if item.meta and item.meta.classification: + predicted_class = ( + item.meta.classification.get_main_prediction().class_name + ) + elif ( + classifications := [ + ann + for ann in item.annotations + if isinstance(ann, PictureClassificationData) + ] + ) and classifications[0].predicted_classes: predicted_class = classifications[0].predicted_classes[0].class_name + if predicted_class: + body += DocumentToken.get_picture_classification_token(predicted_class) if predicted_class in [ PictureClassificationLabel.PIE_CHART, PictureClassificationLabel.BAR_CHART, @@ -250,26 +260,31 @@ def serialize( PictureClassificationLabel.HEATMAP, ]: is_chart = True - body += DocumentToken.get_picture_classification_token(predicted_class) - smiles_annotations = [ + # handle molecule 
data + smi: Optional[str] = None + if item.meta and item.meta.molecule: + smi = item.meta.molecule.smi + elif smiles_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureMoleculeData) - ] - if len(smiles_annotations) > 0: - body += _wrap( - text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value - ) - - tabular_chart_annotations = [ + ]: + smi = smiles_annotations[0].smi + if smi: + body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value) + + # handle tabular chart data + chart_data: Optional[TableData] = None + if item.meta and item.meta.tabular_chart: + chart_data = item.meta.tabular_chart.chart_data + elif tabular_chart_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureTabularChartData) - ] - if len(tabular_chart_annotations) > 0: + ]: + chart_data = tabular_chart_annotations[0].chart_data + if chart_data and chart_data.table_cells: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - data=tabular_chart_annotations[0].chart_data - ) + temp_table = temp_doc.add_table(data=chart_data) otsl_content = temp_table.export_to_otsl( temp_doc, add_cell_location=False ) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index aad194c1..71bd798e 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -865,7 +865,7 @@ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]: elif isinstance(field_val, PictureClassificationMetaField): txt = self._humanize_text(field_val.get_main_prediction().class_name) elif isinstance(field_val, MoleculeMetaField): - txt = field_val.smiles + txt = field_val.smi elif isinstance(field_val, TabularChartMetaField): # suppressing tabular chart serialization return None diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index f82ff9ff..6292761d 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -313,7 +313,7 @@ def _serialize_meta_field( elif isinstance(field_val, PictureClassificationMetaField): txt = self._humanize_text(field_val.get_main_prediction().class_name) elif isinstance(field_val, MoleculeMetaField): - txt = field_val.smiles + txt = field_val.smi elif isinstance(field_val, TabularChartMetaField): # suppressing tabular chart serialization return None diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 676e1b91..626a9734 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1017,7 +1017,7 @@ class SummaryMetaField(BasePrediction): text: str -# NOTE: should be manually kept in sync with top-level BaseMeta hierarchy fields +# NOTE: must be manually kept in sync with top-level BaseMeta hierarchy fields class MetaFieldName(str, Enum): """Standard meta field names.""" @@ -1069,7 +1069,7 @@ def get_main_prediction(self) -> PictureClassificationPrediction: class MoleculeMetaField(BasePrediction): """Molecule metadata field.""" - smiles: str = Field(description="The SMILES representation of the molecule.") + smi: str = Field(description="The SMILES representation of the molecule.") class TabularChartMetaField(BasePrediction): @@ -1643,7 +1643,7 @@ def _migrate_annotations_to_meta(cls, data: Any) -> Any: data["meta"].setdefault( MetaFieldName.MOLECULE.value, MoleculeMetaField( - smiles=ann.smi, + smi=ann.smi, confidence=ann.confidence, created_by=ann.provenance, **{ diff 
--git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index a1dd28ca..a99785ff 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1421,14 +1421,14 @@ ], "title": "Created By" }, - "smiles": { + "smi": { "description": "The SMILES representation of the molecule.", - "title": "Smiles", + "title": "Smi", "type": "string" } }, "required": [ - "smiles" + "smi" ], "title": "MoleculeMetaField", "type": "object" diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 905f137f..03b3fa95 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -107,7 +107,7 @@ pictures: - 1.0 - - 1.0 - 1.0 - smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov: diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml index 8cc32a81..f7334672 100644 --- a/test/data/doc/dummy_doc_with_meta_modified.yaml +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -107,7 +107,7 @@ pictures: - 1.0 - - 1.0 - 1.0 - smiles: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov:
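
Note (not part of the patch): a minimal usage sketch of the metadata naming helpers and the renamed `smi` field introduced in this series. It assumes the classes are exposed via `docling_core.types.doc` as added in the `__init__.py` hunk above; the `my_corp`/`max_size` values mirror the example in the validation error message and are illustrative only.

# Illustrative sketch only -- not part of the patch.
# Exercises MetaUtils.create_meta_field_name (custom fields are namespaced as
# "<namespace>__<name>"), BaseMeta.set_custom_field, and MoleculeMetaField,
# whose SMILES field is named "smi" as of PATCH 22/22.
from docling_core.types.doc import BaseMeta, MetaUtils, MoleculeMetaField

# Build a namespaced custom meta field name ("my_corp" / "max_size" are made up).
key = MetaUtils.create_meta_field_name(namespace="my_corp", name="max_size")
assert key == "my_corp__max_size"

# Attach a custom (namespaced) field to a metadata object; BaseMeta allows
# extra fields, and set_custom_field builds the namespaced key for you.
meta = BaseMeta()
meta.set_custom_field(namespace="my_corp", name="max_size", value=10)

# Molecule metadata carries the SMILES string under "smi" (value taken from the
# test fixture updated in this series).
mol = MoleculeMetaField(smi="CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1")
print(mol.smi)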