diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index dc4f2eee..67d0a727 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -9,6 +9,7 @@ from typing import Any, Optional, Union from pydantic import AnyUrl, BaseModel +from typing_extensions import deprecated from docling_core.types.doc.document import ( DocItem, @@ -258,6 +259,7 @@ def serialize_captions( """Serialize the item's captions.""" ... + @deprecated("Use serialize_meta() instead.") @abstractmethod def serialize_annotations( self, @@ -267,6 +269,15 @@ def serialize_annotations( """Serialize the item's annotations.""" ... + @abstractmethod + def serialize_meta( + self, + item: NodeItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + ... + @abstractmethod def get_excluded_refs(self, **kwargs: Any) -> set[str]: """Get references to excluded items.""" @@ -287,6 +298,26 @@ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer: ... +class BaseMetaSerializer(ABC): + """Base class for meta serializers.""" + + @abstractmethod + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the meta of the passed item.""" + ... + + def _humanize_text(self, text: str, title: bool = False) -> str: + tmp = text.replace("__", "_").replace("_", " ") + return tmp.title() if title else tmp.capitalize() + + +@deprecated("Use BaseMetaSerializer() instead.") class BaseAnnotationSerializer(ABC): """Base class for annotation serializers.""" diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index f5d80af9..4720ada0 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -4,6 +4,7 @@ # """Define base classes for serialization.""" +import logging import re import sys from abc import abstractmethod @@ -11,7 +12,14 @@ from pathlib import Path from typing import Any, Iterable, Optional, Tuple, Union -from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_field +from pydantic import ( + AnyUrl, + BaseModel, + ConfigDict, + Field, + NonNegativeInt, + computed_field, +) from typing_extensions import Self, override from docling_core.transforms.serializer.base import ( @@ -22,6 +30,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -56,6 +65,9 @@ _DEFAULT_LAYERS = {cl for cl in ContentLayer} +_logger = logging.getLogger(__name__) + + class _PageBreakNode(NodeItem): """Page break node.""" @@ -76,11 +88,11 @@ def _iterate_items( traverse_pictures: bool = False, add_page_breaks: bool = False, visited: Optional[set[str]] = None, -): +) -> Iterable[Tuple[NodeItem, int]]: my_visited: set[str] = visited if visited is not None else set() prev_page_nr: Optional[int] = None page_break_i = 0 - for item, _ in doc.iterate_items( + for item, lvl in doc.iterate_items( root=node, with_groups=True, included_content_layers=layers, @@ -93,7 +105,7 @@ def _iterate_items( ): # if group starts with new page, yield page break before group node my_visited.add(item.self_ref) - for it in _iterate_items( + for it, _ in _iterate_items( doc=doc, layers=layers, node=item, @@ -108,7 +120,7 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl break elif isinstance(item, 
DocItem) and item.prov: page_no = item.prov[0].page_no @@ -118,10 +130,10 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl page_break_i += 1 prev_page_nr = page_no - yield item + yield item, lvl def _get_annotation_text( @@ -188,9 +200,22 @@ class CommonParams(BaseModel): start_idx: NonNegativeInt = 0 stop_idx: NonNegativeInt = sys.maxsize + include_non_meta: bool = True + include_formatting: bool = True include_hyperlinks: bool = True caption_delim: str = " " + use_legacy_annotations: bool = Field( + default=False, description="Use legacy annotation serialization." + ) + allowed_meta_names: Optional[set[str]] = Field( + default=None, + description="Meta name to allow; None means all meta names are allowed.", + ) + blocked_meta_names: set[str] = Field( + default_factory=set, + description="Meta name to block; takes precedence over allowed_meta_names.", + ) def merge_with_patch(self, patch: dict[str, Any]) -> Self: """Create an instance by merging the provided patch dict on top of self.""" @@ -215,6 +240,7 @@ class DocSerializer(BaseModel, BaseDocSerializer): list_serializer: BaseListSerializer inline_serializer: BaseInlineSerializer + meta_serializer: Optional[BaseMetaSerializer] = None annotation_serializer: BaseAnnotationSerializer params: CommonParams = CommonParams() @@ -245,7 +271,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: if refs is None: refs = { item.self_ref - for ix, item in enumerate( + for ix, (item, _) in enumerate( _iterate_items( doc=self.doc, traverse_pictures=True, @@ -301,103 +327,130 @@ def serialize( ) -> SerializationResult: """Serialize a given node.""" my_visited: set[str] = visited if visited is not None else set() + parts: list[SerializationResult] = [] + delim: str = kwargs.get("delim", "\n") + my_params = self.params.model_copy(update=kwargs) my_kwargs = {**self.params.model_dump(), **kwargs} empty_res = create_ser_result() - if item is None or item == self.doc.body: - if self.doc.body.self_ref not in my_visited: - my_visited.add(self.doc.body.self_ref) - return self._serialize_body(**my_kwargs) + + my_item = item or self.doc.body + + if my_item == self.doc.body: + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) + + if my_item.self_ref not in my_visited: + my_visited.add(my_item.self_ref) + part = self._serialize_body(**my_kwargs) + if part.text: + parts.append(part) + return create_ser_result( + text=delim.join([p.text for p in parts if p.text]), + span_source=parts, + ) else: return empty_res - my_visited.add(item.self_ref) - - ######## - # groups - ######## - if isinstance(item, ListGroup): - part = self.list_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, InlineGroup): - part = self.inline_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - visited=my_visited, - **my_kwargs, - ) - ########### - # doc items - ########### - elif isinstance(item, TextItem): - if item.self_ref in self._captions_of_some_item: - # those captions will be handled by the floating item holding them - return empty_res - else: - part = ( - self.text_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, + 
my_visited.add(my_item.self_ref) + + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) + + if my_params.include_non_meta: + ######## + # groups + ######## + if isinstance(my_item, ListGroup): + part = self.list_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + elif isinstance(my_item, InlineGroup): + part = self.inline_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + visited=my_visited, + **my_kwargs, + ) + ########### + # doc items + ########### + elif isinstance(my_item, TextItem): + if my_item.self_ref in self._captions_of_some_item: + # those captions will be handled by the floating item holding them + return empty_res + else: + part = ( + self.text_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + if my_item.self_ref not in self.get_excluded_refs(**kwargs) + else empty_res ) - if item.self_ref not in self.get_excluded_refs(**kwargs) - else empty_res + elif isinstance(my_item, TableItem): + part = self.table_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, ) - elif isinstance(item, TableItem): - part = self.table_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, PictureItem): - part = self.picture_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, KeyValueItem): - part = self.key_value_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, FormItem): - part = self.form_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, _PageBreakNode): - part = _PageBreakSerResult( - text=self._create_page_break(node=item), - node=item, - ) - else: - part = self.fallback_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - return part + elif isinstance(my_item, PictureItem): + part = self.picture_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, + ) + elif isinstance(my_item, KeyValueItem): + part = self.key_value_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + **my_kwargs, + ) + elif isinstance(my_item, FormItem): + part = self.form_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + **my_kwargs, + ) + elif isinstance(my_item, _PageBreakNode): + part = _PageBreakSerResult( + text=self._create_page_break(node=my_item), + node=my_item, + ) + else: + part = self.fallback_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, + ) + parts.append(part) + + return create_ser_result( + text=delim.join([p.text for p in parts if p.text]), span_source=parts + ) # making some assumptions about the kwargs it can pass @override @@ -416,7 +469,7 @@ def get_parts( my_visited: set[str] = visited if visited is not None else set() params = self.params.merge_with_patch(patch=kwargs) - for node in _iterate_items( + for 
node, lvl in _iterate_items( node=item, doc=self.doc, layers=params.layers, @@ -426,15 +479,17 @@ def get_parts( continue else: my_visited.add(node.self_ref) + part = self.serialize( item=node, list_level=list_level, is_inline_scope=is_inline_scope, visited=my_visited, - **kwargs, + **(dict(level=lvl) | kwargs), ) if part.text: parts.append(part) + return parts @override @@ -528,6 +583,31 @@ def serialize_captions( text_res = "" return create_ser_result(text=text_res, span_source=results) + @override + def serialize_meta( + self, + item: NodeItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + if self.meta_serializer: + if item.self_ref not in self.get_excluded_refs(**kwargs): + return self.meta_serializer.serialize( + item=item, + doc=self.doc, + **(self.params.model_dump() | kwargs), + ) + else: + return create_ser_result( + text="", span_source=item if isinstance(item, DocItem) else [] + ) + else: + _logger.warning("No meta serializer found.") + return create_ser_result( + text="", span_source=item if isinstance(item, DocItem) else [] + ) + + # TODO deprecate @override def serialize_annotations( self, diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index 844d0096..0195cd8e 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -44,6 +44,7 @@ PictureTabularChartData, ProvenanceItem, SectionHeaderItem, + TableData, TableItem, TextItem, ) @@ -233,13 +234,22 @@ def serialize( ysize=params.ysize, ) - classifications = [ - ann - for ann in item.annotations - if isinstance(ann, PictureClassificationData) - ] - if len(classifications) > 0: + # handle classification data + predicted_class: Optional[str] = None + if item.meta and item.meta.classification: + predicted_class = ( + item.meta.classification.get_main_prediction().class_name + ) + elif ( + classifications := [ + ann + for ann in item.annotations + if isinstance(ann, PictureClassificationData) + ] + ) and classifications[0].predicted_classes: predicted_class = classifications[0].predicted_classes[0].class_name + if predicted_class: + body += DocumentToken.get_picture_classification_token(predicted_class) if predicted_class in [ PictureClassificationLabel.PIE_CHART, PictureClassificationLabel.BAR_CHART, @@ -250,26 +260,31 @@ def serialize( PictureClassificationLabel.HEATMAP, ]: is_chart = True - body += DocumentToken.get_picture_classification_token(predicted_class) - smiles_annotations = [ + # handle molecule data + smi: Optional[str] = None + if item.meta and item.meta.molecule: + smi = item.meta.molecule.smi + elif smiles_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureMoleculeData) - ] - if len(smiles_annotations) > 0: - body += _wrap( - text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value - ) - - tabular_chart_annotations = [ + ]: + smi = smiles_annotations[0].smi + if smi: + body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value) + + # handle tabular chart data + chart_data: Optional[TableData] = None + if item.meta and item.meta.tabular_chart: + chart_data = item.meta.tabular_chart.chart_data + elif tabular_chart_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureTabularChartData) - ] - if len(tabular_chart_annotations) > 0: + ]: + chart_data = tabular_chart_annotations[0].chart_data + if chart_data and chart_data.table_cells: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - 
data=tabular_chart_annotations[0].chart_data - ) + temp_table = temp_doc.add_table(data=chart_data) otsl_content = temp_table.export_to_otsl( temp_doc, add_cell_location=False ) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 98e5cf7d..71bd798e 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -17,7 +17,7 @@ import latex2mathml.converter from PIL.Image import Image -from pydantic import AnyUrl, BaseModel +from pydantic import AnyUrl, BaseModel, Field from typing_extensions import override from docling_core.transforms.serializer.base import ( @@ -28,6 +28,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -46,9 +47,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + BaseMeta, CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DoclingDocument, FloatingItem, @@ -61,14 +64,18 @@ KeyValueItem, ListGroup, ListItem, + MoleculeMetaField, NodeItem, PictureClassificationData, + PictureClassificationMetaField, PictureItem, PictureMoleculeData, PictureTabularChartData, RichTableCell, SectionHeaderItem, + SummaryMetaField, TableItem, + TabularChartMetaField, TextItem, TitleItem, ) @@ -115,7 +122,11 @@ class HTMLParams(CommonParams): # Enable charts to be printed into HTML as tables enable_chart_tables: bool = True - include_annotations: bool = True + include_annotations: bool = Field( + default=True, + description="Include item annotations.", + deprecated="Use include_meta instead.", + ) show_original_list_item_marker: bool = True @@ -808,6 +819,65 @@ def serialize( ) +class HTMLMetaSerializer(BaseModel, BaseMetaSerializer): + """HTML-specific meta serializer.""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = HTMLParams(**kwargs) + return create_ser_result( + text="\n".join( + [ + tmp + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) + if ( + ( + params.allowed_meta_names is None + or key in params.allowed_meta_names + ) + and (key not in params.blocked_meta_names) + and (tmp := self._serialize_meta_field(item.meta, key)) + ) + ] + if item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + # NOTE for now using an empty span source for GroupItems + ) + + def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None: + if isinstance(field_val, SummaryMetaField): + txt = field_val.text + elif isinstance(field_val, DescriptionMetaField): + txt = field_val.text + elif isinstance(field_val, PictureClassificationMetaField): + txt = self._humanize_text(field_val.get_main_prediction().class_name) + elif isinstance(field_val, MoleculeMetaField): + txt = field_val.smi + elif isinstance(field_val, TabularChartMetaField): + # suppressing tabular chart serialization + return None + elif tmp := str(field_val or ""): + txt = tmp + else: + return None + return f"
Version 1.0
Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar
AI4K Group, IBM Research R¨ uschlikon, Switzerland
diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html index 0bb79d05..7f10d0ac 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html @@ -4,7 +4,7 @@Version 1.0
Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar
AI4K Group, IBM Research R¨ uschlikon, Switzerland
diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md new file mode 100644 index 00000000..3f8a9266 --- /dev/null +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -0,0 +1,51 @@ +# Docling Technical Report + +[Description] In this image we can see a cartoon image of a duck holding a paper. + + + +Version 1.0 + +Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar + +AI4K Group, IBM Research R¨ uschlikon, Switzerland + +## Abstract + +This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. + +## 1 Introduction + +Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. + +With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. + +torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. + +[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + +summary: Typical Docling setup runtime characterization. +type: performance data + +Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. 
+ +| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | +|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| +| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | +| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | +| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | + +## 5 Applications + +Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. + +## 6 Future work and contributions + +Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. + +We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. + +## References + +- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. +- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. 
Pytorch 2: Faster diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md similarity index 100% rename from test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md rename to test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html index f728cdb3..00bf0385 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html @@ -126,7 +126,8 @@licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].
We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.
torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.
+| CPU | Thread budget | native backend | pypdfium backend | ||||
|---|---|---|---|---|---|---|---|
| TTS | Pages/s | Mem | TTS | Pages/s | Mem | ||
| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |
| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |
Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.
diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7bbddf7b..7a501c17 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "2408.09869v3", "furniture": { "self_ref": "#/furniture", @@ -1901,6 +1901,12 @@ } ], "content_layer": "body", + "meta": { + "description": { + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." + } + }, "label": "picture", "prov": [ { @@ -1955,6 +1961,12 @@ } ], "content_layer": "body", + "meta": { + "docling_legacy__misc": { + "summary": "Typical Docling setup runtime characterization.", + "type": "performance data" + } + }, "label": "table", "prov": [ { diff --git a/test/data/doc/2408.09869v3_enriched_split.gt.html b/test/data/doc/2408.09869v3_enriched_split.gt.html index 33c39bc5..1adaa3d9 100644 --- a/test/data/doc/2408.09869v3_enriched_split.gt.html +++ b/test/data/doc/2408.09869v3_enriched_split.gt.html @@ -96,7 +96,8 @@Version 1.0
Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar
AI4K Group, IBM Research R¨ uschlikon, Switzerland
@@ -147,7 +148,8 @@licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].
We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.
torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.
+| CPU | Thread budget | native backend | pypdfium backend | ||||
|---|---|---|---|---|---|---|---|
| TTS | Pages/s | Mem | TTS | Pages/s | Mem | ||
| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |
| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |
Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.
@@ -273,12 +276,16 @@PDF document conversion, layout segmentation, object-detection, data set, Machine Learning
Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043
-AGL Energy Limited ABN 74 1
5 061 375
-Figure 1: Four examples of complex page layouts across different document categories
PDF document conversion, layout segmentation, object-detection, data set, Machine Learning
@@ -303,11 +310,12 @@to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.
The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this
-Third, achienec
chalenongayouls ground-vuth dawa such WC
-Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.
paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.
In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].
@@ -341,18 +350,22 @@Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %
between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.
of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric
-| class label | Count | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|
| Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten | ||
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
include publication repositories such as arXiv
Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-
annotated pages, from which we obtain accuracy ranges.
-| % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |
| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |
| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |
| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |
| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |
| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |
| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |
| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |
| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |
| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |
| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |
| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |
3
,
@@ -363,7 +376,8 @@Title and
.
page. Specificity ensures that the choice of label is not ambiguous,
-we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific
only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can
quality controls. Phase one and two required a small team of experts to a document category, such as
diff --git a/test/data/doc/barchart.gt.html b/test/data/doc/barchart.gt.html index 6ee917ef..05cbb81f 100644 --- a/test/data/doc/barchart.gt.html +++ b/test/data/doc/barchart.gt.html @@ -124,7 +124,8 @@| Number of impellers | single-frequency | multi-frequency |
| 1 | 0.06 | 0.16 |
| 2 | 0.12 | 0.26 |
| 3 | 0.16 | 0.27 |
| 4 | 0.14 | 0.26 |
| 5 | 0.16 | 0.25 |
| 6 | 0.24 | 0.24 |
| Number of impellers | single-frequency | multi-frequency |
| 1 | 0.06 | 0.16 |
| 2 | 0.12 | 0.26 |
| 3 | 0.16 | 0.27 |
| 4 | 0.14 | 0.26 |
| 5 | 0.16 | 0.25 |
| 6 | 0.24 | 0.24 |