diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index dc4f2eee..67d0a727 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -9,6 +9,7 @@ from typing import Any, Optional, Union from pydantic import AnyUrl, BaseModel +from typing_extensions import deprecated from docling_core.types.doc.document import ( DocItem, @@ -258,6 +259,7 @@ def serialize_captions( """Serialize the item's captions.""" ... + @deprecated("Use serialize_meta() instead.") @abstractmethod def serialize_annotations( self, @@ -267,6 +269,15 @@ def serialize_annotations( """Serialize the item's annotations.""" ... + @abstractmethod + def serialize_meta( + self, + item: NodeItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + ... + @abstractmethod def get_excluded_refs(self, **kwargs: Any) -> set[str]: """Get references to excluded items.""" @@ -287,6 +298,26 @@ def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer: ... +class BaseMetaSerializer(ABC): + """Base class for meta serializers.""" + + @abstractmethod + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the meta of the passed item.""" + ... 
def _humanize_text(self, text: str, title: bool = False) -> str:
    """Turn an identifier-like string into human-readable text.

    Underscores are treated as word separators: every run of one or more
    underscores becomes a single space, and leading/trailing underscores are
    dropped so the result never starts or ends with a space.

    Args:
        text: The raw text, e.g. ``"bar_chart"`` or ``"my_corp__max_size"``.
        title: If True, capitalize every word; otherwise capitalize only the
            first character and lowercase the rest.

    Returns:
        The humanized text (an empty string if no word characters remain).
    """
    # Splitting on "_" and discarding empty chunks collapses underscore runs
    # and trims the edges in a single pass; without the trim, a leading "_"
    # would become a leading space and defeat `capitalize()`.
    words = " ".join(chunk for chunk in text.split("_") if chunk)
    return words.title() if title else words.capitalize()
_iterate_items( + for it, _ in _iterate_items( doc=doc, layers=layers, node=item, @@ -108,7 +120,7 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl break elif isinstance(item, DocItem) and item.prov: page_no = item.prov[0].page_no @@ -118,10 +130,10 @@ def _iterate_items( self_ref=f"#/pb/{page_break_i}", prev_page=prev_page_nr, next_page=page_no, - ) + ), lvl page_break_i += 1 prev_page_nr = page_no - yield item + yield item, lvl def _get_annotation_text( @@ -188,9 +200,22 @@ class CommonParams(BaseModel): start_idx: NonNegativeInt = 0 stop_idx: NonNegativeInt = sys.maxsize + include_non_meta: bool = True + include_formatting: bool = True include_hyperlinks: bool = True caption_delim: str = " " + use_legacy_annotations: bool = Field( + default=False, description="Use legacy annotation serialization." + ) + allowed_meta_names: Optional[set[str]] = Field( + default=None, + description="Meta name to allow; None means all meta names are allowed.", + ) + blocked_meta_names: set[str] = Field( + default_factory=set, + description="Meta name to block; takes precedence over allowed_meta_names.", + ) def merge_with_patch(self, patch: dict[str, Any]) -> Self: """Create an instance by merging the provided patch dict on top of self.""" @@ -215,6 +240,7 @@ class DocSerializer(BaseModel, BaseDocSerializer): list_serializer: BaseListSerializer inline_serializer: BaseInlineSerializer + meta_serializer: Optional[BaseMetaSerializer] = None annotation_serializer: BaseAnnotationSerializer params: CommonParams = CommonParams() @@ -245,7 +271,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: if refs is None: refs = { item.self_ref - for ix, item in enumerate( + for ix, (item, _) in enumerate( _iterate_items( doc=self.doc, traverse_pictures=True, @@ -301,103 +327,130 @@ def serialize( ) -> SerializationResult: """Serialize a given node.""" my_visited: set[str] = visited if visited is not None else set() + parts: 
list[SerializationResult] = [] + delim: str = kwargs.get("delim", "\n") + my_params = self.params.model_copy(update=kwargs) my_kwargs = {**self.params.model_dump(), **kwargs} empty_res = create_ser_result() - if item is None or item == self.doc.body: - if self.doc.body.self_ref not in my_visited: - my_visited.add(self.doc.body.self_ref) - return self._serialize_body(**my_kwargs) + + my_item = item or self.doc.body + + if my_item == self.doc.body: + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) + + if my_item.self_ref not in my_visited: + my_visited.add(my_item.self_ref) + part = self._serialize_body(**my_kwargs) + if part.text: + parts.append(part) + return create_ser_result( + text=delim.join([p.text for p in parts if p.text]), + span_source=parts, + ) else: return empty_res - my_visited.add(item.self_ref) - - ######## - # groups - ######## - if isinstance(item, ListGroup): - part = self.list_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, InlineGroup): - part = self.inline_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - list_level=list_level, - visited=my_visited, - **my_kwargs, - ) - ########### - # doc items - ########### - elif isinstance(item, TextItem): - if item.self_ref in self._captions_of_some_item: - # those captions will be handled by the floating item holding them - return empty_res - else: - part = ( - self.text_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - is_inline_scope=is_inline_scope, - visited=my_visited, - **my_kwargs, + my_visited.add(my_item.self_ref) + + if my_item.meta and not my_params.use_legacy_annotations: + meta_part = self.serialize_meta(item=my_item, **my_kwargs) + if meta_part.text: + parts.append(meta_part) 
+ + if my_params.include_non_meta: + ######## + # groups + ######## + if isinstance(my_item, ListGroup): + part = self.list_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + elif isinstance(my_item, InlineGroup): + part = self.inline_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + list_level=list_level, + visited=my_visited, + **my_kwargs, + ) + ########### + # doc items + ########### + elif isinstance(my_item, TextItem): + if my_item.self_ref in self._captions_of_some_item: + # those captions will be handled by the floating item holding them + return empty_res + else: + part = ( + self.text_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + is_inline_scope=is_inline_scope, + visited=my_visited, + **my_kwargs, + ) + if my_item.self_ref not in self.get_excluded_refs(**kwargs) + else empty_res ) - if item.self_ref not in self.get_excluded_refs(**kwargs) - else empty_res + elif isinstance(my_item, TableItem): + part = self.table_serializer.serialize( + item=my_item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, ) - elif isinstance(item, TableItem): - part = self.table_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, PictureItem): - part = self.picture_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - visited=my_visited, - **my_kwargs, - ) - elif isinstance(item, KeyValueItem): - part = self.key_value_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, FormItem): - part = self.form_serializer.serialize( - item=item, - doc_serializer=self, - doc=self.doc, - **my_kwargs, - ) - elif isinstance(item, _PageBreakNode): - part = _PageBreakSerResult( - 
@override
def serialize_meta(
    self,
    item: NodeItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the meta of the given item.

    Delegates to the configured meta serializer, passing the serializer
    params overlaid with the call-specific kwargs. An empty result is
    returned when no meta serializer is set (a warning is logged in that
    case) or when the item is among the excluded refs.
    """
    serializer = self.meta_serializer
    if not serializer:
        _logger.warning("No meta serializer found.")
    elif item.self_ref not in self.get_excluded_refs(**kwargs):
        # call-specific kwargs take precedence over the stored params
        merged_kwargs = {**self.params.model_dump(), **kwargs}
        return serializer.serialize(
            item=item,
            doc=self.doc,
            **merged_kwargs,
        )

    # nothing to serialize: missing serializer or excluded item
    span_source = item if isinstance(item, DocItem) else []
    return create_ser_result(text="", span_source=span_source)
item.meta.molecule: + smi = item.meta.molecule.smi + elif smiles_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureMoleculeData) - ] - if len(smiles_annotations) > 0: - body += _wrap( - text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value - ) - - tabular_chart_annotations = [ + ]: + smi = smiles_annotations[0].smi + if smi: + body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value) + + # handle tabular chart data + chart_data: Optional[TableData] = None + if item.meta and item.meta.tabular_chart: + chart_data = item.meta.tabular_chart.chart_data + elif tabular_chart_annotations := [ ann for ann in item.annotations if isinstance(ann, PictureTabularChartData) - ] - if len(tabular_chart_annotations) > 0: + ]: + chart_data = tabular_chart_annotations[0].chart_data + if chart_data and chart_data.table_cells: temp_doc = DoclingDocument(name="temp") - temp_table = temp_doc.add_table( - data=tabular_chart_annotations[0].chart_data - ) + temp_table = temp_doc.add_table(data=chart_data) otsl_content = temp_table.export_to_otsl( temp_doc, add_cell_location=False ) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 98e5cf7d..71bd798e 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -17,7 +17,7 @@ import latex2mathml.converter from PIL.Image import Image -from pydantic import AnyUrl, BaseModel +from pydantic import AnyUrl, BaseModel, Field from typing_extensions import override from docling_core.transforms.serializer.base import ( @@ -28,6 +28,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -46,9 +47,11 @@ from docling_core.transforms.visualizer.base import BaseVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + BaseMeta, CodeItem, ContentLayer, 
DescriptionAnnotation, + DescriptionMetaField, DocItem, DoclingDocument, FloatingItem, @@ -61,14 +64,18 @@ KeyValueItem, ListGroup, ListItem, + MoleculeMetaField, NodeItem, PictureClassificationData, + PictureClassificationMetaField, PictureItem, PictureMoleculeData, PictureTabularChartData, RichTableCell, SectionHeaderItem, + SummaryMetaField, TableItem, + TabularChartMetaField, TextItem, TitleItem, ) @@ -115,7 +122,11 @@ class HTMLParams(CommonParams): # Enable charts to be printed into HTML as tables enable_chart_tables: bool = True - include_annotations: bool = True + include_annotations: bool = Field( + default=True, + description="Include item annotations.", + deprecated="Use include_meta instead.", + ) show_original_list_item_marker: bool = True @@ -808,6 +819,65 @@ def serialize( ) +class HTMLMetaSerializer(BaseModel, BaseMetaSerializer): + """HTML-specific meta serializer.""" + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = HTMLParams(**kwargs) + return create_ser_result( + text="\n".join( + [ + tmp + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) + if ( + ( + params.allowed_meta_names is None + or key in params.allowed_meta_names + ) + and (key not in params.blocked_meta_names) + and (tmp := self._serialize_meta_field(item.meta, key)) + ) + ] + if item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + # NOTE for now using an empty span source for GroupItems + ) + + def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None: + if isinstance(field_val, SummaryMetaField): + txt = field_val.text + elif isinstance(field_val, DescriptionMetaField): + txt = field_val.text + elif isinstance(field_val, PictureClassificationMetaField): + txt = 
self._humanize_text(field_val.get_main_prediction().class_name) + elif isinstance(field_val, MoleculeMetaField): + txt = field_val.smi + elif isinstance(field_val, TabularChartMetaField): + # suppressing tabular chart serialization + return None + elif tmp := str(field_val or ""): + txt = tmp + else: + return None + return f"
{txt}
" + else: + return None + + class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer): """HTML-specific annotation serializer.""" @@ -858,6 +928,7 @@ class HTMLDocSerializer(DocSerializer): list_serializer: BaseListSerializer = HTMLListSerializer() inline_serializer: BaseInlineSerializer = HTMLInlineSerializer() + meta_serializer: BaseMetaSerializer = HTMLMetaSerializer() annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer() params: HTMLParams = HTMLParams() @@ -1047,7 +1118,11 @@ def serialize_captions( ) results.append(cap_ser_res) - if params.include_annotations and item.self_ref not in excluded_refs: + if ( + params.use_legacy_annotations + and params.include_annotations + and item.self_ref not in excluded_refs + ): if isinstance(item, (PictureItem, TableItem)): ann_res = self.serialize_annotations( item=item, diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index d0908270..6292761d 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Any, Optional, Union -from pydantic import AnyUrl, BaseModel, PositiveInt +from pydantic import AnyUrl, BaseModel, Field, PositiveInt from tabulate import tabulate from typing_extensions import override @@ -23,6 +23,7 @@ BaseInlineSerializer, BaseKeyValueSerializer, BaseListSerializer, + BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, BaseTextSerializer, @@ -36,9 +37,11 @@ ) from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + BaseMeta, CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DocItemLabel, DoclingDocument, @@ -52,14 +55,18 @@ KeyValueItem, ListGroup, ListItem, + MoleculeMetaField, NodeItem, PictureClassificationData, + PictureClassificationMetaField, PictureItem, PictureMoleculeData, PictureTabularChartData, 
class MarkdownMetaSerializer(BaseModel, BaseMetaSerializer):
    """Meta serializer producing Markdown output."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the item's meta.

        Standard meta fields are visited first, followed by custom (extra)
        fields. A field is emitted only if it passes the allow/block filters
        of the params and renders to non-empty text; the rendered fields are
        joined by blank lines.
        """
        params = MarkdownParams(**kwargs)
        meta = item.meta
        rendered: list[str] = []
        if meta:
            field_names = [*meta.__class__.model_fields, *meta.get_custom_part()]
            for field_name in field_names:
                allowed = params.allowed_meta_names
                if allowed is not None and field_name not in allowed:
                    continue
                if field_name in params.blocked_meta_names:
                    continue
                field_text = self._serialize_meta_field(
                    meta, field_name, params.mark_meta
                )
                if field_text:
                    rendered.append(field_text)

        return create_ser_result(
            text="\n\n".join(rendered),
            # NOTE for now using an empty span source for GroupItems
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(
        self, meta: BaseMeta, name: str, mark_meta: bool
    ) -> Optional[str]:
        """Render a single meta field, or return None if there is nothing to render.

        When `mark_meta` is set, the text is prefixed with the humanized field
        name in square brackets.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if isinstance(field_val, (SummaryMetaField, DescriptionMetaField)):
            txt = field_val.text
        elif isinstance(field_val, PictureClassificationMetaField):
            txt = self._humanize_text(field_val.get_main_prediction().class_name)
        elif isinstance(field_val, MoleculeMetaField):
            txt = field_val.smi
        elif isinstance(field_val, TabularChartMetaField):
            # suppressing tabular chart serialization
            return None
        else:
            # any other (e.g. custom) value: fall back to its string form
            txt = str(field_val or "")
            if not txt:
                return None

        if mark_meta:
            return f"[{self._humanize_text(name, title=True)}] {txt}"
        return txt
Optional[set[str]] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serialize a given node.""" + return super().serialize( + item=item, + list_level=list_level, + is_inline_scope=is_inline_scope, + visited=visited, + **(dict(delim="\n\n") | kwargs), + ) diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index 25b5a869..cf0edfef 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -9,6 +9,8 @@ from .document import ( AnyTableCell, BaseAnnotation, + BaseMeta, + BasePrediction, ChartBar, ChartLine, ChartPoint, @@ -17,12 +19,14 @@ CodeItem, ContentLayer, DescriptionAnnotation, + DescriptionMetaField, DocItem, DoclingDocument, DocTagsDocument, DocTagsPage, DocumentOrigin, FloatingItem, + FloatingMeta, Formatting, FormItem, FormulaItem, @@ -35,7 +39,10 @@ KeyValueItem, ListGroup, ListItem, + MetaFieldName, + MetaUtils, MiscAnnotation, + MoleculeMetaField, NodeItem, OrderedList, PageItem, @@ -43,9 +50,11 @@ PictureChartData, PictureClassificationClass, PictureClassificationData, + PictureClassificationMetaField, PictureDataType, PictureItem, PictureLineChartData, + PictureMeta, PictureMoleculeData, PicturePieChartData, PictureScatterChartData, @@ -56,9 +65,11 @@ RichTableCell, Script, SectionHeaderItem, + SummaryMetaField, TableCell, TableData, TableItem, + TabularChartMetaField, TextItem, TitleItem, UnorderedList, diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 45d8611b..626a9734 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -27,6 +27,8 @@ Field, FieldSerializationInfo, StringConstraints, + TypeAdapter, + ValidationError, computed_field, field_serializer, field_validator, @@ -60,7 +62,7 @@ Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))] LevelNumber = typing.Annotated[int, Field(ge=1, le=100)] -CURRENT_VERSION: Final = "1.7.0" +CURRENT_VERSION: Final = "1.8.0" DEFAULT_EXPORT_LABELS = { 
class _ExtraAllowingModel(BaseModel):
    """Base model allowing extra fields.

    Extra ("custom") fields must be named ``<namespace>__<field_name>``,
    whereas declared ("standard") fields must not contain the ``__``
    delimiter. Both rules are enforced on validation; the custom-field rule is
    additionally enforced on attribute assignment.
    """

    model_config = ConfigDict(extra="allow")

    def get_custom_part(self) -> dict[str, Any]:
        """Get the extra fields as a dictionary."""
        return self.__pydantic_extra__ or {}

    def _copy_without_extra(self) -> Self:
        """Create a copy without the extra fields."""
        return self.model_validate(
            self.model_dump(exclude=set(self.get_custom_part()))
        )

    def _check_custom_field_format(self, key: str) -> None:
        """Raise ValueError unless `key` has a non-empty namespace and field name."""
        namespace, delim, field_name = key.partition(
            MetaUtils._META_FIELD_NAMESPACE_DELIMITER
        )
        if not (namespace and delim and field_name):
            raise ValueError(
                f"Custom meta field name must be in format 'namespace__field_name' (e.g. 'my_corp__max_size'): {key}"
            )

    @model_validator(mode="after")
    def _validate_field_names(self) -> Self:
        """Validate the names of both the standard and the custom fields."""
        # Only the names are needed here, so read them from the field
        # definitions and the extras instead of dumping the whole model.
        for key in type(self).model_fields:
            if MetaUtils._META_FIELD_NAMESPACE_DELIMITER in key:
                raise ValueError(
                    f"Standard meta field name must not contain '__': {key}"
                )
        for key in self.get_custom_part():
            self._check_custom_field_format(key=key)

        return self

    def __setattr__(self, name: str, value: Any) -> None:
        """Set an attribute, rejecting malformed custom field names.

        Raises:
            ValueError: If `name` ends up as an extra field but is not in the
                ``namespace__field_name`` format; the instance is left without
                that field in this case.
        """
        super().__setattr__(name, value)
        extra = self.get_custom_part()
        if name in extra:
            try:
                self._check_custom_field_format(key=name)
            except ValueError:
                # roll back so that a rejected key does not linger on the model
                extra.pop(name, None)
                raise

    def set_custom_field(self, namespace: str, name: str, value: Any) -> str:
        """Set a custom field and return the key."""
        key = MetaUtils.create_meta_field_name(namespace=namespace, name=name)
        setattr(self, key, value)
        return key
def get_main_prediction(self) -> "PictureClassificationPrediction":
    """Get prediction with highest confidence (if confidence not available, first is used by convention).

    Among equally confident predictions the earliest one wins.

    Returns:
        The main prediction.

    Raises:
        IndexError: If there are no predictions at all. This can happen for a
            default-constructed instance, as the empty default list is not
            subject to the `min_length` constraint unless default validation
            is enabled.
    """
    if not self.predictions:
        raise IndexError(
            "Cannot determine the main prediction: `predictions` is empty."
        )

    scored = [pred for pred in self.predictions if pred.confidence is not None]
    if not scored:
        # no confidence available anywhere: first prediction by convention
        return self.predictions[0]

    # max() returns the first maximal element, i.e. ties keep the earliest one
    return max(scored, key=lambda pred: pred.confidence)
class MetaUtils:
    """Metadata-related utilities."""

    # separates the namespace from the field name in custom meta field names
    _META_FIELD_NAMESPACE_DELIMITER: Final = "__"
    # namespace used for meta fields created while migrating legacy data
    _META_FIELD_LEGACY_NAMESPACE: Final = "docling_legacy"

    @classmethod
    def create_meta_field_name(
        cls,
        *,
        namespace: str,
        name: str,
    ) -> str:
        """Create a meta field name.

        Args:
            namespace: The namespace part, e.g. ``"my_corp"``.
            name: The field name part, e.g. ``"max_size"``.

        Returns:
            The combined name, e.g. ``"my_corp__max_size"``.

        Raises:
            ValueError: If `namespace` or `name` is empty, since the resulting
                key would lack one of the two required parts.
        """
        if not namespace or not name:
            raise ValueError(
                "Both namespace and name must be non-empty to create a meta field name: "
                f"namespace={namespace!r}, name={name!r}"
            )
        return f"{namespace}{cls._META_FIELD_NAMESPACE_DELIMITER}{name}"

    @classmethod
    def _create_migrated_meta_field_name(
        cls,
        *,
        name: str,
    ) -> str:
        """Create a meta field name within the legacy (migration) namespace."""
        return cls.create_meta_field_name(
            namespace=cls._META_FIELD_LEGACY_NAMESPACE, name=name
        )
@model_validator(mode="before") + @classmethod + def _migrate_annotations_to_meta(cls, data: Any) -> Any: + """Migrate the `annotations` field to `meta`.""" + if isinstance(data, dict) and (annotations := data.get("annotations")): + _logger.warning( + "Migrating deprecated `annotations` to `meta`; this will be removed in the future. " + "Note that only the first available instance of each annotation type will be migrated." + ) + for raw_ann in annotations: + # migrate annotations to meta + + try: + ann: PictureDataType = TypeAdapter(PictureDataType).validate_python( + raw_ann + ) + except ValidationError as e: + raise e + + # ensure meta field is present + data.setdefault("meta", {}) + + if isinstance(ann, PictureClassificationData): + data["meta"].setdefault( + MetaFieldName.CLASSIFICATION.value, + PictureClassificationMetaField( + predictions=[ + PictureClassificationPrediction( + class_name=pred.class_name, + confidence=pred.confidence, + created_by=ann.provenance, + ) + for pred in ann.predicted_classes + ], + ).model_dump(mode="json"), + ) + elif isinstance(ann, DescriptionAnnotation): + data["meta"].setdefault( + MetaFieldName.DESCRIPTION.value, + DescriptionMetaField( + text=ann.text, + created_by=ann.provenance, + ).model_dump(mode="json"), + ) + elif isinstance(ann, PictureMoleculeData): + data["meta"].setdefault( + MetaFieldName.MOLECULE.value, + MoleculeMetaField( + smi=ann.smi, + confidence=ann.confidence, + created_by=ann.provenance, + **{ + MetaUtils._create_migrated_meta_field_name( + name="segmentation" + ): ann.segmentation, + MetaUtils._create_migrated_meta_field_name( + name="class_name" + ): ann.class_name, + }, + ).model_dump(mode="json"), + ) + elif isinstance(ann, PictureTabularChartData): + data["meta"].setdefault( + MetaFieldName.TABULAR_CHART.value, + TabularChartMetaField( + title=ann.title, + chart_data=ann.chart_data, + ).model_dump(mode="json"), + ) + elif isinstance(ann, MiscAnnotation): + data["meta"].setdefault( + 
MetaUtils._create_migrated_meta_field_name(name=ann.kind), + ann.content, + ) + else: + # fall back to reusing original annotation type name (in namespaced format) + data["meta"].setdefault( + MetaUtils._create_migrated_meta_field_name(name=ann.kind), + ann.model_dump(mode="json"), + ) + + return data # Convert the image to Base64 def _image_to_base64(self, pil_image, format="PNG"): @@ -1554,7 +1824,54 @@ class TableItem(FloatingItem): DocItemLabel.TABLE, ] = DocItemLabel.TABLE - annotations: List[TableAnnotationType] = [] + annotations: Annotated[ + List[TableAnnotationType], + deprecated("Field `annotations` is deprecated; use `meta` instead."), + ] = [] + + @model_validator(mode="before") + @classmethod + def migrate_annotations_to_meta(cls, data: Any) -> Any: + """Migrate the `annotations` field to `meta`.""" + if isinstance(data, dict) and (annotations := data.get("annotations")): + _logger.warning( + "Migrating deprecated `annotations` to `meta`; this will be removed in the future. " + "Note that only the first available instance of each annotation type will be migrated." 
+ ) + for raw_ann in annotations: + # migrate annotations to meta + + try: + ann: TableAnnotationType = TypeAdapter( + TableAnnotationType + ).validate_python(raw_ann) + except ValidationError as e: + raise e + + # ensure meta field is present + data.setdefault("meta", {}) + + if isinstance(ann, DescriptionAnnotation): + data["meta"].setdefault( + MetaFieldName.DESCRIPTION.value, + DescriptionMetaField( + text=ann.text, + created_by=ann.provenance, + ).model_dump(mode="json"), + ) + elif isinstance(ann, MiscAnnotation): + data["meta"].setdefault( + MetaUtils._create_migrated_meta_field_name(name=ann.kind), + ann.content, + ) + else: + # fall back to reusing original annotation type name (in namespaced format) + data["meta"].setdefault( + MetaUtils._create_migrated_meta_field_name(name=ann.kind), + ann.model_dump(mode="json"), + ) + + return data def export_to_dataframe( self, doc: Optional["DoclingDocument"] = None @@ -4396,6 +4713,9 @@ def save_as_markdown( included_content_layers: Optional[set[ContentLayer]] = None, page_break_placeholder: Optional[str] = None, include_annotations: bool = True, + *, + mark_meta: bool = False, + use_legacy_annotations: bool = False, ): """Save to markdown.""" if isinstance(filename, str): @@ -4425,6 +4745,8 @@ def save_as_markdown( included_content_layers=included_content_layers, page_break_placeholder=page_break_placeholder, include_annotations=include_annotations, + use_legacy_annotations=use_legacy_annotations, + mark_meta=mark_meta, ) with open(filename, "w", encoding="utf-8") as fw: @@ -4449,6 +4771,11 @@ def export_to_markdown( # noqa: C901 page_break_placeholder: Optional[str] = None, # e.g. "", include_annotations: bool = True, mark_annotations: bool = False, + *, + use_legacy_annotations: bool = False, + allowed_meta_names: Optional[set[str]] = None, + blocked_meta_names: Optional[set[str]] = None, + mark_meta: bool = False, ) -> str: r"""Serialize to Markdown. 
@@ -4494,8 +4821,18 @@ def export_to_markdown( # noqa: C901 :param mark_annotations: bool: Whether to mark annotations in the export; only relevant if include_annotations is True. (Default value = False). :type mark_annotations: bool = False + :param use_legacy_annotations: bool: Whether to use legacy annotation serialization. + (Default value = False). + :type use_legacy_annotations: bool = False + :param mark_meta: bool: Whether to mark meta in the export; only + relevant if use_legacy_annotations is False. (Default value = False). + :type mark_meta: bool = False :returns: The exported Markdown representation. :rtype: str + :param allowed_meta_names: Optional[set[str]]: Meta names to allow; None means all meta names are allowed. + :type allowed_meta_names: Optional[set[str]] = None + :param blocked_meta_names: Optional[set[str]]: Meta names to block; takes precedence over allowed_meta_names. + :type blocked_meta_names: Optional[set[str]] = None """ from docling_core.transforms.serializer.markdown import ( MarkdownDocSerializer, @@ -4524,7 +4861,11 @@ def export_to_markdown( # noqa: C901 indent=indent, wrap_width=text_width if text_width > 0 else None, page_break_placeholder=page_break_placeholder, + mark_meta=mark_meta, include_annotations=include_annotations, + use_legacy_annotations=use_legacy_annotations, + allowed_meta_names=allowed_meta_names, + blocked_meta_names=blocked_meta_names or set(), mark_annotations=mark_annotations, ), ) @@ -5530,16 +5871,17 @@ def check_version_is_compatible(cls, v: str) -> str: return CURRENT_VERSION @model_validator(mode="after") # type: ignore - @classmethod - def validate_document(cls, d: "DoclingDocument"): + def validate_document(self) -> Self: """validate_document.""" with warnings.catch_warnings(): # ignore warning from deprecated furniture warnings.filterwarnings("ignore", category=DeprecationWarning) - if not d.validate_tree(d.body) or not d.validate_tree(d.furniture): + if not self.validate_tree(self.body) or not 
self.validate_tree( + self.furniture + ): raise ValueError("Document hierachy is inconsistent.") - return d + return self @model_validator(mode="after") def validate_misplaced_list_items(self): @@ -5746,6 +6088,13 @@ def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument": return res_doc def _validate_rules(self): + + def validate_furniture(doc: DoclingDocument): + if doc.furniture.children: + raise ValueError( + f"Deprecated furniture node {doc.furniture.self_ref} has children" + ) + def validate_list_group(doc: DoclingDocument, item: ListGroup): for ref in item.children: child = ref.resolve(doc) @@ -5768,6 +6117,8 @@ def validate_group(doc: DoclingDocument, item: GroupItem): ): # tolerate empty body, but not other groups raise ValueError(f"Group {item.self_ref} has no children") + validate_furniture(self) + for item, _ in self.iterate_items( with_groups=True, traverse_pictures=True, diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..a99785ff 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -1,5 +1,24 @@ { "$defs": { + "BaseMeta": { + "additionalProperties": true, + "description": "Base class for metadata.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryMetaField" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "BaseMeta", + "type": "object" + }, "BoundingBox": { "description": "BoundingBox.", "properties": { @@ -194,6 +213,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/FloatingMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "code", "default": "code", @@ -403,6 +433,56 @@ "title": "DescriptionAnnotation", "type": "object" }, + "DescriptionMetaField": { + "additionalProperties": true, + "description": "Description metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + 
{ + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "text": { + "title": "Text", + "type": "string" + } + }, + "required": [ + "text" + ], + "title": "DescriptionMetaField", + "type": "object" + }, "DocumentOrigin": { "description": "FileSource.", "properties": { @@ -443,6 +523,36 @@ "title": "DocumentOrigin", "type": "object" }, + "FloatingMeta": { + "additionalProperties": true, + "description": "Metadata model for floating.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/DescriptionMetaField" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "FloatingMeta", + "type": "object" + }, "FormItem": { "additionalProperties": false, "description": "FormItem.", @@ -475,6 +585,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/FloatingMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "form", "default": "form", @@ -598,6 +719,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "formula", "default": "formula", @@ -807,6 +939,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -912,6 +1055,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + 
"meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -962,6 +1116,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/FloatingMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1054,6 +1219,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "name": { "default": "group", "title": "Name", @@ -1104,6 +1280,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "list_item", "default": "list_item", @@ -1195,6 +1382,57 @@ "title": "MiscAnnotation", "type": "object" }, + "MoleculeMetaField": { + "additionalProperties": true, + "description": "Molecule metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "smi": { + "description": "The SMILES representation of the molecule.", + "title": "Smi", + "type": "string" + } + }, + "required": [ + "smi" + ], + "title": "MoleculeMetaField", + "type": "object" + }, "PageItem": { "description": "PageItem.", "properties": { @@ -1309,6 +1547,72 @@ "title": "PictureClassificationData", "type": "object" }, + "PictureClassificationMetaField": { + 
"additionalProperties": true, + "description": "Picture classification metadata field.", + "properties": { + "predictions": { + "items": { + "$ref": "#/$defs/PictureClassificationPrediction" + }, + "minItems": 1, + "title": "Predictions", + "type": "array" + } + }, + "title": "PictureClassificationMetaField", + "type": "object" + }, + "PictureClassificationPrediction": { + "additionalProperties": true, + "description": "Picture classification instance.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "class_name": { + "title": "Class Name", + "type": "string" + } + }, + "required": [ + "class_name" + ], + "title": "PictureClassificationPrediction", + "type": "object" + }, "PictureItem": { "additionalProperties": false, "description": "PictureItem.", @@ -1341,6 +1645,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/PictureMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "default": "picture", "enum": [ @@ -1395,6 +1710,7 @@ }, "annotations": { "default": [], + "deprecated": true, "items": { "discriminator": { "mapping": { @@ -1492,6 +1808,69 @@ "title": "PictureLineChartData", "type": "object" }, + "PictureMeta": { + "additionalProperties": true, + "description": "Metadata model for pictures.", + "properties": { + "summary": { + "anyOf": [ + { + "$ref": "#/$defs/SummaryMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, + "description": { + "anyOf": [ + { + "$ref": "#/$defs/DescriptionMetaField" + }, 
+ { + "type": "null" + } + ], + "default": null + }, + "classification": { + "anyOf": [ + { + "$ref": "#/$defs/PictureClassificationMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, + "molecule": { + "anyOf": [ + { + "$ref": "#/$defs/MoleculeMetaField" + }, + { + "type": "null" + } + ], + "default": null + }, + "tabular_chart": { + "anyOf": [ + { + "$ref": "#/$defs/TabularChartMetaField" + }, + { + "type": "null" + } + ], + "default": null + } + }, + "title": "PictureMeta", + "type": "object" + }, "PictureMoleculeData": { "description": "PictureMoleculeData.", "properties": { @@ -1842,6 +2221,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "section_header", "default": "section_header", @@ -1926,6 +2316,56 @@ "title": "Size", "type": "object" }, + "SummaryMetaField": { + "additionalProperties": true, + "description": "Summary data.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "text": { + "title": "Text", + "type": "string" + } + }, + "required": [ + "text" + ], + "title": "SummaryMetaField", + "type": "object" + }, "TableCell": { "description": "TableCell.", "properties": { @@ -2065,6 +2505,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/FloatingMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "default": "table", "enum": [ @@ -2122,6 +2573,7 @@ }, 
"annotations": { "default": [], + "deprecated": true, "items": { "discriminator": { "mapping": { @@ -2150,6 +2602,67 @@ "title": "TableItem", "type": "object" }, + "TabularChartMetaField": { + "additionalProperties": true, + "description": "Tabular chart metadata field.", + "properties": { + "confidence": { + "anyOf": [ + { + "maximum": 1, + "minimum": 0, + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The confidence of the prediction.", + "examples": [ + 0.9, + 0.42 + ], + "title": "Confidence" + }, + "created_by": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The origin of the prediction.", + "examples": [ + "ibm-granite/granite-docling-258M" + ], + "title": "Created By" + }, + "title": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Title" + }, + "chart_data": { + "$ref": "#/$defs/TableData" + } + }, + "required": [ + "chart_data" + ], + "title": "TabularChartMetaField", + "type": "object" + }, "TextItem": { "additionalProperties": false, "description": "TextItem.", @@ -2182,6 +2695,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "enum": [ "caption", @@ -2285,6 +2809,17 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "meta": { + "anyOf": [ + { + "$ref": "#/$defs/BaseMeta" + }, + { + "type": "null" + } + ], + "default": null + }, "label": { "const": "title", "default": "title", @@ -2355,7 +2890,7 @@ "type": "string" }, "version": { - "default": "1.7.0", + "default": "1.8.0", "pattern": "^(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)\\.(?P0|[1-9]\\d*)(?:-(?P(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$", "title": "Version", "type": "string" @@ -2382,6 +2917,7 @@ "parent": 
null, "children": [], "content_layer": "furniture", + "meta": null, "name": "_root_", "label": "unspecified" }, @@ -2394,6 +2930,7 @@ "parent": null, "children": [], "content_layer": "body", + "meta": null, "name": "_root_", "label": "unspecified" } diff --git a/test/data/chunker/0_out_chunks.json b/test/data/chunker/0_out_chunks.json index 5eb6ff4c..f0eefe5a 100644 --- a/test/data/chunker/0_out_chunks.json +++ b/test/data/chunker/0_out_chunks.json @@ -13,6 +13,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -852,36 +853,11 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\nIn this image, we can see some text and images.", + "text": "In this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ - { - "self_ref": "#/texts/31", - "parent": { - "$ref": "#/pictures/1" - }, - "children": [], - "content_layer": "body", - "label": "caption", - "prov": [ - { - "page_no": 3, - "bbox": { - "l": 108.0, - "t": 570.003, - "r": 504.00300000000004, - "b": 550.542, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 134 - ] - } - ] - }, { "self_ref": "#/pictures/1", "parent": { @@ -938,6 +914,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -955,6 +932,31 @@ ] } ] + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/pictures/1" + }, + "children": [], + "content_layer": "body", + "label": "caption", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 108.0, + "t": 570.003, + "r": 504.00300000000004, + "b": 550.542, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 134 + ] + } + ] } ], "headings": [ @@ -3791,6 +3793,7 @@ } ], "content_layer": 
"body", + "meta": {}, "label": "picture", "prov": [ { @@ -4108,6 +4111,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -4376,6 +4380,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -4606,6 +4611,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5079,6 +5085,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5208,6 +5215,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5771,6 +5779,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -6292,6 +6301,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -6382,6 +6392,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -7193,6 +7204,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -7879,6 +7891,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { diff --git a/test/data/chunker/0b_out_chunks.json b/test/data/chunker/0b_out_chunks.json index a242c810..27ec0ce9 100644 --- a/test/data/chunker/0b_out_chunks.json +++ b/test/data/chunker/0b_out_chunks.json @@ -13,6 +13,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -852,36 +853,11 @@ } }, { - "text": "Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\nIn this image, we can see some text and images.", + "text": "In this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. 
The inner part of the model pipeline is easily customizable and extensible.", "meta": { "schema_name": "docling_core.transforms.chunker.DocMeta", "version": "1.0.0", "doc_items": [ - { - "self_ref": "#/texts/31", - "parent": { - "$ref": "#/pictures/1" - }, - "children": [], - "content_layer": "body", - "label": "caption", - "prov": [ - { - "page_no": 3, - "bbox": { - "l": 108.0, - "t": 570.003, - "r": 504.00300000000004, - "b": 550.542, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 134 - ] - } - ] - }, { "self_ref": "#/pictures/1", "parent": { @@ -938,6 +914,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -955,6 +932,31 @@ ] } ] + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/pictures/1" + }, + "children": [], + "content_layer": "body", + "label": "caption", + "prov": [ + { + "page_no": 3, + "bbox": { + "l": 108.0, + "t": 570.003, + "r": 504.00300000000004, + "b": 550.542, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 134 + ] + } + ] } ], "headings": [ @@ -3791,6 +3793,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -4108,6 +4111,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -4376,6 +4380,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -4606,6 +4611,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5079,6 +5085,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5208,6 +5215,7 @@ }, "children": [], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -5771,6 +5779,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -6292,6 +6301,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -6382,6 +6392,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { @@ -7193,6 +7204,7 @@ } ], "content_layer": 
"body", + "meta": {}, "label": "picture", "prov": [ { @@ -7879,6 +7891,7 @@ } ], "content_layer": "body", + "meta": {}, "label": "picture", "prov": [ { diff --git a/test/data/doc/2206.01062.yaml.dt.json b/test/data/doc/2206.01062.yaml.dt.json index f954386b..83dd30ce 100644 --- a/test/data/doc/2206.01062.yaml.dt.json +++ b/test/data/doc/2206.01062.yaml.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/2408.09869v3_enriched.gt.md b/test/data/doc/2408.09869v3_enriched.gt.md index 7669a2a8..a8604726 100644 --- a/test/data/doc/2408.09869v3_enriched.gt.md +++ b/test/data/doc/2408.09869v3_enriched.gt.md @@ -2,10 +2,10 @@ -Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. - In this image, we can see some text and images. +Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible. + licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14]. 
diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index 7d3e159a..fec32692 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html index 3e166869..7f10d0ac 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html @@ -4,6 +4,7 @@

Docling Technical Report

+
In this image we can see a cartoon image of a duck holding a paper.

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

diff --git a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html index 0bb79d05..7f10d0ac 100644 --- a/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html @@ -4,7 +4,7 @@

Docling Technical Report

-
In this image we can see a cartoon image of a duck holding a paper.
+
In this image we can see a cartoon image of a duck holding a paper.

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md new file mode 100644 index 00000000..3f8a9266 --- /dev/null +++ b/test/data/doc/2408.09869v3_enriched_p1_mark_meta_true.gt.md @@ -0,0 +1,51 @@ +# Docling Technical Report + +[Description] In this image we can see a cartoon image of a duck holding a paper. + + + +Version 1.0 + +Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar + +AI4K Group, IBM Research R¨ uschlikon, Switzerland + +## Abstract + +This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models. + +## 1 Introduction + +Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. 
As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions. + +With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models. + +torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report. + +[Docling Legacy Misc] {'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'} + +summary: Typical Docling setup runtime characterization. +type: performance data + +Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads. 
+ +| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend | +|----------------------------------|-----------------|------------------|------------------|------------------|--------------------|--------------------|--------------------| +| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem | +| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB | +| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB | + +## 5 Applications + +Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets. + +## 6 Future work and contributions + +Docling is designed to allow easy extension of the model library and pipelines. 
In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too. + +We encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report. + +## References + +- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0. +- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S. Chintala. 
Pytorch 2: Faster diff --git a/test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md b/test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md similarity index 100% rename from test/data/doc/2408.09869v3_enriched_p1_mark_annotations_true.gt.md rename to test/data/doc/2408.09869v3_enriched_p1_use_legacy_annotations_true_mark_annotations_true.gt.md diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html index f728cdb3..00bf0385 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html @@ -126,7 +126,8 @@

3.1 PDF backends

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
+
In this image, we can see some text and images.
+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

3.2 AI models

@@ -148,6 +149,7 @@

Table Structure Recognition

torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.

+
{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'}
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
CPUThread budgetnative backendpypdfium backend
TTSPages/sMemTTSPages/sMem
Apple M3 Max4177 s 167 s1.27 1.346.20 GB103 s 92 s2.18 2.452.56 GB
(16 cores) Intel(R) Xeon E5-269016 4 16375 s 244 s0.60 0.926.16 GB239 s 143 s0.94 1.572.42 GB

5 Applications

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7bbddf7b..7a501c17 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "2408.09869v3", "furniture": { "self_ref": "#/furniture", @@ -1901,6 +1901,12 @@ } ], "content_layer": "body", + "meta": { + "description": { + "created_by": "HuggingFaceTB/SmolVLM-256M-Instruct", + "text": "In this image, we can see some text and images." + } + }, "label": "picture", "prov": [ { @@ -1955,6 +1961,12 @@ } ], "content_layer": "body", + "meta": { + "docling_legacy__misc": { + "summary": "Typical Docling setup runtime characterization.", + "type": "performance data" + } + }, "label": "table", "prov": [ { diff --git a/test/data/doc/2408.09869v3_enriched_split.gt.html b/test/data/doc/2408.09869v3_enriched_split.gt.html index 33c39bc5..1adaa3d9 100644 --- a/test/data/doc/2408.09869v3_enriched_split.gt.html +++ b/test/data/doc/2408.09869v3_enriched_split.gt.html @@ -96,7 +96,8 @@

Docling Technical Report

-
In this image we can see a cartoon image of a duck holding a paper.
+
In this image we can see a cartoon image of a duck holding a paper.
+

Version 1.0

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

AI4K Group, IBM Research R¨ uschlikon, Switzerland

@@ -147,7 +148,8 @@

3.1 PDF backends

-
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.
In this image, we can see some text and images.
+
In this image, we can see some text and images.
+
Figure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.

licensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].

We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.

3.2 AI models

@@ -193,6 +195,7 @@

4 Performance

torch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.

+
{'summary': 'Typical Docling setup runtime characterization.', 'type': 'performance data'}
Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.
CPUThread budgetnative backendpypdfium backend
TTSPages/sMemTTSPages/sMem
Apple M3 Max4177 s 167 s1.27 1.346.20 GB103 s 92 s2.18 2.452.56 GB
(16 cores) Intel(R) Xeon E5-269016 4 16375 s 244 s0.60 0.926.16 GB239 s 143 s0.94 1.572.42 GB

5 Applications

Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.

@@ -273,12 +276,16 @@

KEYWORDS

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

ACM Reference Format:

Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043

-
In this image there is a table with some text on it.
-
In this image we can see a text.
+
In this image there is a table with some text on it.
+
+
In this image we can see a text.
+

AGL Energy Limited ABN 74 1

5 061 375

-
In this image I can see the cover of the book.
-
In this image there is a paper with some text on it.
+
In this image I can see the cover of the book.
+
+
In this image there is a paper with some text on it.
+

Figure 1: Four examples of complex page layouts across different document categories

KEYWORDS

PDF document conversion, layout segmentation, object-detection, data set, Machine Learning

@@ -303,11 +310,12 @@

ACMReference Format:

to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.

5 EXPERIMENTS

The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this

-
In this image, we can see a table with some text.
+
In this image, we can see a table with some text.
+

Third, achienec

EXPERIMENTS

chalenongayouls ground-vuth dawa such WC

-
The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. +
The image is a line graph that shows the percentage of respondents who have completed a certain training program over a period of time. The x-axis represents the percentage of respondents, ranging from 0% to 70%. The y-axis represents the percentage of respondents, ranging from 0% to 70%. The graph shows a trend of increasing the percentage of respondents who have completed the training program over time. The graph has two lines: one for the training program and one for the percentage of respondents who have completed the training program. The line for the training program is shown to be increasing, while the line for the percentage of respondents who have completed the training program is decreasing. @@ -315,7 +323,8 @@

EXPERIMENTS

#### Training Program: - **Initial Data**: The graph shows a general increase in the percentage of respondents who have completed the training program. -- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%.
+- **Peak**: The peak in the percentage of respondents who have completed the training program is around 70%.
+

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.

In this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].

@@ -341,18 +350,22 @@

Baselines for Object Detection

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %

between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.

of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric

-
The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. +
The image is a flat, two-dimensional representation of a letter "A" on a blue circle. The letter "A" is positioned in the center of the circle. The circle has a smooth, gradient background that transitions from a lighter blue at the top to a darker blue at the bottom. The gradient effect gives the letter "A" a three-dimensional appearance, making it appear three-dimensional. -The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. +The letter "A" is white, which is the standard color used for the letter "A" in many languages. The letter "A" is a capital letter, and it is positioned in the center of the circle. The circle itself is a simple, flat shape, which is often used in digital art and design to create a clean and simple design. -The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A"
-
In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318.
+The background of the circle is a gradient, transitioning from a lighter blue at the top to a darker blue at the bottom. This gradient effect creates a sense of depth and dimension, making the letter "A"
+
+
In this image, there is a table with two columns. The first column is labeled "Class label," and the second column is labeled "Count." The first row in the table has the label "Class label," and the count is 22524. The second row in the table has the label "Count," and the count is 6318.
+
class labelCount% of Totaltriple inter-annotator mAP @ 0.5-0.95 (%)
TrainTestValAllFinManSciLawPatTen
Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
Page-footer708786.515.586.0093-9488-9095-9610092-9710096-98
Page-header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
Section-header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85
-
In this image I can see a blue circle.
+
In this image I can see a blue circle.
+

include publication repositories such as arXiv

Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-

annotated pages, from which we obtain accuracy ranges.

-
A table with different columns and rows.
+
A table with different columns and rows.
+
% of Total% of Total% of Totaltriple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)triple inter- annotator mAP @ 0.5-0.95 (%)
class labelCountTrainTestValAllFinManSciLawPatTen
Caption225242.041.772.3284-8940-6186-9294-9995-9969-78n/a
Footnote63180.600.310.5883-91n/a10062-8885-94n/a82-97
Formula250272.251.902.9683-85n/an/a84-8786-96n/an/a
List-item18566017.1913.3415.8287-8874-8390-9297-9781-8575-8893-95
Page- footer708786.515.586.0093-9488-9095-9610092-9710096-98
Page- header580225.106.705.0685-8966-7690-9498-10091-9297-9981-86
Picture459764.212.785.3169-7156-5982-8669-8280-9566-7159-76
Section- header14288412.6015.7712.8583-8476-8190-9294-9587-9469-7378-86
Table347333.202.273.6077-8175-8083-8698-9958-8079-8470-85
Text51037745.8249.2845.0084-8681-8688-9389-9387-9271-7987-95
Title50710.470.300.5060-7224-6350-6394-10082-9668-7924-56
Total1107470941123998166653182-8371-7479-8189-9486-9171-7668-85

3

,

@@ -363,7 +376,8 @@

Baselines for Object Detection

Title and

.

page. Specificity ensures that the choice of label is not ambiguous,

-
In this image there is a text in the middle.
+
In this image there is a text in the middle.
+

we distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific

only. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can

quality controls. Phase one and two required a small team of experts to a document category, such as

diff --git a/test/data/doc/barchart.gt.html b/test/data/doc/barchart.gt.html index 6ee917ef..05cbb81f 100644 --- a/test/data/doc/barchart.gt.html +++ b/test/data/doc/barchart.gt.html @@ -124,7 +124,8 @@
-
bar chart
Number of impellerssingle-frequencymulti-frequency
10.060.16
20.120.26
30.160.27
40.140.26
50.160.25
60.240.24
+
Bar chart
+
Number of impellerssingle-frequencymulti-frequency
10.060.16
20.120.26
30.160.27
40.140.26
50.160.25
60.240.24
diff --git a/test/data/doc/barchart.gt.md b/test/data/doc/barchart.gt.md index 84f58652..0adc5569 100644 --- a/test/data/doc/barchart.gt.md +++ b/test/data/doc/barchart.gt.md @@ -1,4 +1,4 @@ -bar chart +Bar chart diff --git a/test/data/doc/concatenated.json b/test/data/doc/concatenated.json index 47fe4990..e48e9d61 100644 --- a/test/data/doc/concatenated.json +++ b/test/data/doc/concatenated.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "2501.17887v1 + Untitled 1 + 2311.18481v1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.added_extracted_doc.json.gt b/test/data/doc/constructed_doc.added_extracted_doc.json.gt index 4013747b..ed878b6a 100644 --- a/test/data/doc/constructed_doc.added_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.added_extracted_doc.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.appended_child.json.gt b/test/data/doc/constructed_doc.appended_child.json.gt index 74b6fba7..d3e30ed0 100644 --- a/test/data/doc/constructed_doc.appended_child.json.gt +++ b/test/data/doc/constructed_doc.appended_child.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.bulk_item_addition.json.gt b/test/data/doc/constructed_doc.bulk_item_addition.json.gt index 257c5b90..a4379c8f 100644 --- a/test/data/doc/constructed_doc.bulk_item_addition.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_addition.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt 
b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt index ce4f7c6d..398c5c62 100644 --- a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_group.json.gt b/test/data/doc/constructed_doc.deleted_group.json.gt index 549ae6a0..5cea9068 100644 --- a/test/data/doc/constructed_doc.deleted_group.json.gt +++ b/test/data/doc/constructed_doc.deleted_group.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_items_range.json.gt b/test/data/doc/constructed_doc.deleted_items_range.json.gt index 91b37357..12ed02c5 100644 --- a/test/data/doc/constructed_doc.deleted_items_range.json.gt +++ b/test/data/doc/constructed_doc.deleted_items_range.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_picture.json.gt b/test/data/doc/constructed_doc.deleted_picture.json.gt index 85890f23..bfcdd153 100644 --- a/test/data/doc/constructed_doc.deleted_picture.json.gt +++ b/test/data/doc/constructed_doc.deleted_picture.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.deleted_text.json.gt b/test/data/doc/constructed_doc.deleted_text.json.gt index 45c03c2a..62d866c5 100644 --- a/test/data/doc/constructed_doc.deleted_text.json.gt +++ b/test/data/doc/constructed_doc.deleted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": 
"DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.embedded.json.gt b/test/data/doc/constructed_doc.embedded.json.gt index 4ac0e019..8fff1a5b 100644 --- a/test/data/doc/constructed_doc.embedded.json.gt +++ b/test/data/doc/constructed_doc.embedded.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.embedded.yaml.gt b/test/data/doc/constructed_doc.embedded.yaml.gt index 15d93ce3..d2d171d2 100644 --- a/test/data/doc/constructed_doc.embedded.yaml.gt +++ b/test/data/doc/constructed_doc.embedded.yaml.gt @@ -1113,4 +1113,4 @@ texts: prov: [] self_ref: '#/texts/55' text: The end. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt index fc7a3b94..1712938c 100644 --- a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt +++ b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt index a31af507..68b1dceb 100644 --- a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt index 2722426c..7144b362 100644 
--- a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt index 42044db6..ded60ef6 100644 --- a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.inserted_text.json.gt b/test/data/doc/constructed_doc.inserted_text.json.gt index 6c4285f4..abff4454 100644 --- a/test/data/doc/constructed_doc.inserted_text.json.gt +++ b/test/data/doc/constructed_doc.inserted_text.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.manipulated_table.json.gt b/test/data/doc/constructed_doc.manipulated_table.json.gt index e65dd7d8..66223131 100644 --- a/test/data/doc/constructed_doc.manipulated_table.json.gt +++ b/test/data/doc/constructed_doc.manipulated_table.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.referenced.json.gt b/test/data/doc/constructed_doc.referenced.json.gt index 8a11418f..da939109 100644 --- a/test/data/doc/constructed_doc.referenced.json.gt +++ b/test/data/doc/constructed_doc.referenced.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - 
"version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/constructed_doc.referenced.yaml.gt b/test/data/doc/constructed_doc.referenced.yaml.gt index bb291c11..2a2355a2 100644 --- a/test/data/doc/constructed_doc.referenced.yaml.gt +++ b/test/data/doc/constructed_doc.referenced.yaml.gt @@ -1113,4 +1113,4 @@ texts: prov: [] self_ref: '#/texts/55' text: The end. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/constructed_doc.replaced_item.json.gt b/test/data/doc/constructed_doc.replaced_item.json.gt index 91b37357..12ed02c5 100644 --- a/test/data/doc/constructed_doc.replaced_item.json.gt +++ b/test/data/doc/constructed_doc.replaced_item.json.gt @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Untitled 1", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/doc_with_kv.dt.json b/test/data/doc/doc_with_kv.dt.json index d59bf29a..c56dbd34 100644 --- a/test/data/doc/doc_with_kv.dt.json +++ b/test/data/doc/doc_with_kv.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html index 0e9e69d5..1273bff5 100644 --- a/test/data/doc/dummy_doc.yaml.html +++ b/test/data/doc/dummy_doc.yaml.html @@ -125,8 +125,13 @@

DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis

-
Figure 1: Four examples of complex page layouts across different document categories
bar chart
...
CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
-
A description annotation for this table.
+
...
+
Bar chart
+
CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
+
{'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}
+
Figure 1: Four examples of complex page layouts across different document categories
+
A description annotation for this table.
+
{'foo': 'bar'}
diff --git a/test/data/doc/dummy_doc.yaml.md b/test/data/doc/dummy_doc.yaml.md index bab71376..bd4e6b23 100644 --- a/test/data/doc/dummy_doc.yaml.md +++ b/test/data/doc/dummy_doc.yaml.md @@ -1,13 +1,17 @@ # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis -Figure 1: Four examples of complex page layouts across different document categories - -bar chart - ... +Bar chart + CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 +{'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} + +Figure 1: Four examples of complex page layouts across different document categories + A description annotation for this table. + +{'foo': 'bar'} diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 60cca33f..03b3fa95 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -80,6 +80,34 @@ pictures: width: 231.0 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: picture + meta: + classification: + predictions: + - class_name: bar_chart + confidence: 0.8 + created_by: model1 + description: + created_by: model2 + text: '...' + docling_legacy__misc: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: + confidence: 1.0 + created_by: model3-1.0.0 + docling_legacy__class_name: chemistry_molecular_structure + docling_legacy__segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 parent: $ref: '#/body' prov: @@ -139,6 +167,12 @@ tables: width: 231.12 uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= label: table + meta: + description: + created_by: model3 + text: A description annotation for this table. 
+ docling_legacy__misc: + foo: bar parent: $ref: '#/body' prov: @@ -237,4 +271,4 @@ texts: self_ref: '#/texts/3' text: 'Figure 1: Four examples of complex page layouts across different document categories' -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/dummy_doc_mark_meta.md b/test/data/doc/dummy_doc_mark_meta.md new file mode 100644 index 00000000..f9f8cdf4 --- /dev/null +++ b/test/data/doc/dummy_doc_mark_meta.md @@ -0,0 +1,17 @@ +# DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + +[Description] ... + +[Classification] Bar chart + +[Molecule] CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + +[Docling Legacy Misc] {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} + +Figure 1: Four examples of complex page layouts across different document categories + + + +[Description] A description annotation for this table. + +[Docling Legacy Misc] {'foo': 'bar'} \ No newline at end of file diff --git a/test/data/doc/dummy_doc_with_meta.yaml b/test/data/doc/dummy_doc_with_meta.yaml new file mode 100644 index 00000000..bb4d0296 --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta.yaml @@ -0,0 +1,249 @@ +body: + children: + - $ref: '#/texts/1' + - $ref: '#/pictures/0' + - $ref: '#/texts/3' + - $ref: '#/tables/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: + - $ref: '#/texts/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: [] +key_value_items: [] +name: dummy_doc +origin: + binary_hash: 7954723514066505909 + filename: dummy_doc.pdf + mimetype: application/pdf +pages: + '1': + image: + dpi: 144 + mimetype: image/png + size: + height: 1166.0 + width: 1536.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + page_no: 1 + size: + height: 583.15 + width: 768.23 +pictures: +- annotations: + - kind: classification + predicted_classes: 
+ - class_name: bar_chart + confidence: 0.78 + provenance: model1 + - kind: description + provenance: model2 + text: '...' + - class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + - content: + myanalysis: + prediction: abc + something_else: + text: aaa + kind: misc + captions: + - $ref: '#/texts/3' + children: + - $ref: '#/texts/2' + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + parent: + $ref: '#/body' + prov: + - bbox: + b: 623.4 + coord_origin: TOPLEFT + l: 456.3 + r: 702.5 + t: 145.8 + charspan: + - 0 + - 288 + page_no: 1 + references: [] + self_ref: '#/pictures/0' +- annotations: [] + captions: [] + children: [] + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 2.0 + width: 2.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/pictures/1' +schema_name: DoclingDocument +tables: +- annotations: + - kind: description + provenance: model3 + text: A description annotation for this table. 
+ - content: + foo: bar + kind: misc + captions: [] + children: [] + content_layer: body + data: + grid: [] + num_cols: 0 + num_rows: 0 + table_cells: [] + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: table + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + references: [] + self_ref: '#/tables/0' +texts: +- children: [] + content_layer: body + label: page_header + orig: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 + parent: + $ref: '#/furniture' + prov: + - bbox: + b: 476.2 + coord_origin: TOPLEFT + l: 21.3 + r: 35.2 + t: 52.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/0' + text: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 +- children: [] + content_layer: body + label: title + meta: + summary: + confidence: 0.95 + docling_legacy__provenance: model1 + text: This is a title. + my_corp__foo: More stuff here. + orig: 'DocLayNet: A Large Human-Annotated Dataset for + + Document-Layout Analysis' + parent: + $ref: '#/body' + prov: + - bbox: + b: 53.4 + coord_origin: TOPLEFT + l: 65.0 + r: 623.2 + t: 30.1 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/1' + text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' +- children: [] + content_layer: body + label: section_header + level: 1 + meta: + summary: + text: This is a section header. + orig: OPERATION (cont.) + parent: + $ref: '#/pictures/0' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 0 + - 734 + page_no: 1 + self_ref: '#/texts/2' + text: OPERATION (cont.) 
+- children: [] + content_layer: body + label: caption + orig: 'Figure 1: Four examples of complex page layouts across dif- + + ferent document categories' + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/3' + text: 'Figure 1: Four examples of complex page layouts across different document + categories' +version: 1.7.0 diff --git a/test/data/doc/dummy_doc_with_meta_modified.yaml b/test/data/doc/dummy_doc_with_meta_modified.yaml new file mode 100644 index 00000000..f7334672 --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta_modified.yaml @@ -0,0 +1,286 @@ +body: + children: + - $ref: '#/texts/1' + - $ref: '#/pictures/0' + - $ref: '#/texts/3' + - $ref: '#/tables/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: + - $ref: '#/texts/0' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: [] +key_value_items: [] +name: dummy_doc +origin: + binary_hash: 7954723514066505909 + filename: dummy_doc.pdf + mimetype: application/pdf +pages: + '1': + image: + dpi: 144 + mimetype: image/png + size: + height: 1166.0 + width: 1536.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + page_no: 1 + size: + height: 583.15 + width: 768.23 +pictures: +- annotations: + - kind: classification + predicted_classes: + - class_name: bar_chart + confidence: 0.78 + provenance: model1 + - kind: description + provenance: model2 + text: '...' 
+ - class_name: chemistry_molecular_structure + confidence: 0.98 + kind: molecule_data + provenance: model3-1.0.0 + segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + - content: + myanalysis: + prediction: abc + something_else: + text: aaa + kind: misc + captions: + - $ref: '#/texts/3' + children: + - $ref: '#/texts/2' + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + meta: + classification: + predictions: + - class_name: bar_chart + confidence: 0.78 + created_by: model1 + description: + created_by: model2 + text: '...' + docling_legacy__misc: + myanalysis: + prediction: abc + something_else: + text: aaa + molecule: + confidence: 0.98 + created_by: model3-1.0.0 + docling_legacy__class_name: chemistry_molecular_structure + docling_legacy__segmentation: + - - 0.0 + - 0.0 + - - 1.0 + - 0.0 + - - 0.0 + - 1.0 + - - 1.0 + - 1.0 + smi: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + parent: + $ref: '#/body' + prov: + - bbox: + b: 623.4 + coord_origin: TOPLEFT + l: 456.3 + r: 702.5 + t: 145.8 + charspan: + - 0 + - 288 + page_no: 1 + references: [] + self_ref: '#/pictures/0' +- annotations: [] + captions: [] + children: [] + content_layer: body + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 2.0 + width: 2.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: picture + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/pictures/1' +schema_name: DoclingDocument +tables: +- annotations: + - kind: description + provenance: model3 + text: A description annotation for this table. 
+ - content: + foo: bar + kind: misc + captions: [] + children: [] + content_layer: body + data: + grid: [] + num_cols: 0 + num_rows: 0 + table_cells: [] + footnotes: [] + image: + dpi: 72 + mimetype: image/png + size: + height: 351.0 + width: 231.0 + uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAACCAIAAAD91JpzAAAAC0lEQVR4nGNgQAYAAA4AAamRc7EAAAAASUVORK5CYII= + label: table + meta: + description: + created_by: model3 + text: A description annotation for this table. + docling_legacy__misc: + foo: bar + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + references: [] + self_ref: '#/tables/0' +texts: +- children: [] + content_layer: body + label: page_header + orig: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 + parent: + $ref: '#/furniture' + prov: + - bbox: + b: 476.2 + coord_origin: TOPLEFT + l: 21.3 + r: 35.2 + t: 52.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/0' + text: arXiv:2206.01062v1 [cs.CV] 2 Jun 2022 +- children: [] + content_layer: body + label: title + meta: + my_corp__foo: More stuff here. + summary: + confidence: 0.95 + docling_legacy__provenance: model1 + text: This is a title. + orig: 'DocLayNet: A Large Human-Annotated Dataset for + + Document-Layout Analysis' + parent: + $ref: '#/body' + prov: + - bbox: + b: 53.4 + coord_origin: TOPLEFT + l: 65.0 + r: 623.2 + t: 30.1 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/1' + text: 'DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis' +- children: [] + content_layer: body + label: section_header + level: 1 + meta: + my_corp__coords: + latitude: 8.5417 + longitude: 47.3769 + summary: + text: This is a section header. + orig: OPERATION (cont.) + parent: + $ref: '#/pictures/0' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 0 + - 734 + page_no: 1 + self_ref: '#/texts/2' + text: OPERATION (cont.) 
+- children: [] + content_layer: body + label: caption + orig: 'Figure 1: Four examples of complex page layouts across dif- + + ferent document categories' + parent: + $ref: '#/body' + prov: + - bbox: + b: 334.4 + coord_origin: BOTTOMLEFT + l: 323.0 + r: 376.0 + t: 354.3 + charspan: + - 1 + - 423 + page_no: 1 + self_ref: '#/texts/3' + text: 'Figure 1: Four examples of complex page layouts across different document + categories' +version: 1.8.0 diff --git a/test/data/doc/group_with_metadata.yaml b/test/data/doc/group_with_metadata.yaml new file mode 100644 index 00000000..7cfd08d5 --- /dev/null +++ b/test/data/doc/group_with_metadata.yaml @@ -0,0 +1,128 @@ +body: + children: + - $ref: '#/groups/0' + content_layer: body + label: unspecified + meta: + summary: + text: This document talks about various topics. + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: [] + content_layer: furniture + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: +- children: + - $ref: '#/texts/0' + - $ref: '#/groups/1' + - $ref: '#/groups/3' + content_layer: body + label: chapter + meta: + summary: + text: This chapter discusses foo and bar. + name: '1' + parent: + $ref: '#/body' + self_ref: '#/groups/0' +- children: + - $ref: '#/texts/1' + - $ref: '#/groups/2' + content_layer: body + label: section + meta: + my_corp__test_1: custom field value 1 + summary: + text: This section talks about foo. + name: 1a + parent: + $ref: '#/groups/0' + self_ref: '#/groups/1' +- children: + - $ref: '#/texts/2' + - $ref: '#/texts/3' + content_layer: body + label: list + meta: + summary: + text: Here some foo specifics are listed. + name: group + parent: + $ref: '#/groups/1' + self_ref: '#/groups/2' +- children: + - $ref: '#/texts/4' + content_layer: body + label: section + meta: + my_corp__test_2: custom field value 2 + summary: + text: This section talks about bar. 
+ name: 1b + parent: + $ref: '#/groups/0' + self_ref: '#/groups/3' +key_value_items: [] +name: '' +pages: {} +pictures: [] +schema_name: DoclingDocument +tables: [] +texts: +- children: [] + content_layer: body + label: text + orig: This is some introductory text. + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/0' + text: This is some introductory text. +- children: [] + content_layer: body + label: text + meta: + summary: + text: This paragraph provides more details about foo. + orig: Regarding foo... + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/1' + text: Regarding foo... +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '' + orig: lorem + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/2' + text: lorem +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '' + orig: ipsum + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/3' + text: ipsum +- children: [] + content_layer: body + label: text + orig: Regarding bar... + parent: + $ref: '#/groups/3' + prov: [] + self_ref: '#/texts/4' + text: Regarding bar... +version: 1.8.0 diff --git a/test/data/doc/group_with_metadata_allowed_meta_names.md b/test/data/doc/group_with_metadata_allowed_meta_names.md new file mode 100644 index 00000000..af1bad3d --- /dev/null +++ b/test/data/doc/group_with_metadata_allowed_meta_names.md @@ -0,0 +1,10 @@ +This is some introductory text. + +[My Corp Test 1] custom field value 1 + +Regarding foo... + +1. lorem +2. ipsum + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_blocked_meta_names.md b/test/data/doc/group_with_metadata_blocked_meta_names.md new file mode 100644 index 00000000..2662ad01 --- /dev/null +++ b/test/data/doc/group_with_metadata_blocked_meta_names.md @@ -0,0 +1,10 @@ +This is some introductory text. + +Regarding foo... + +1. lorem +2. ipsum + +[My Corp Test 2] custom field value 2 + +Regarding bar... 
\ No newline at end of file diff --git a/test/data/doc/group_with_metadata_default.md b/test/data/doc/group_with_metadata_default.md new file mode 100644 index 00000000..f018e510 --- /dev/null +++ b/test/data/doc/group_with_metadata_default.md @@ -0,0 +1,24 @@ +This document talks about various topics. + +This chapter discusses foo and bar. + +This is some introductory text. + +This section talks about foo. + +custom field value 1 + +This paragraph provides more details about foo. + +Regarding foo... + +Here some foo specifics are listed. + +1. lorem +2. ipsum + +This section talks about bar. + +custom field value 2 + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_marked.md b/test/data/doc/group_with_metadata_marked.md new file mode 100644 index 00000000..e73eaf0f --- /dev/null +++ b/test/data/doc/group_with_metadata_marked.md @@ -0,0 +1,24 @@ +[Summary] This document talks about various topics. + +[Summary] This chapter discusses foo and bar. + +This is some introductory text. + +[Summary] This section talks about foo. + +[My Corp Test 1] custom field value 1 + +[Summary] This paragraph provides more details about foo. + +Regarding foo... + +[Summary] Here some foo specifics are listed. + +1. lorem +2. ipsum + +[Summary] This section talks about bar. + +[My Corp Test 2] custom field value 2 + +Regarding bar... \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_summaries.md b/test/data/doc/group_with_metadata_summaries.md new file mode 100644 index 00000000..45b927ec --- /dev/null +++ b/test/data/doc/group_with_metadata_summaries.md @@ -0,0 +1,11 @@ +[#/body] [GroupItem:unspecified] This document talks about various topics. + + [#/groups/0] [GroupItem:chapter] This chapter discusses foo and bar. + + [#/groups/1] [GroupItem:section] This section talks about foo. + + [#/texts/1] [TextItem:text] This paragraph provides more details about foo. 
+ + [#/groups/2] [ListGroup:list] Here some foo specifics are listed. + + [#/groups/3] [GroupItem:section] This section talks about bar. \ No newline at end of file diff --git a/test/data/doc/group_with_metadata_without_non_meta.md b/test/data/doc/group_with_metadata_without_non_meta.md new file mode 100644 index 00000000..f8bdd082 --- /dev/null +++ b/test/data/doc/group_with_metadata_without_non_meta.md @@ -0,0 +1,15 @@ +[Summary] This document talks about various topics. + +[Summary] This chapter discusses foo and bar. + +[Summary] This section talks about foo. + +[My Corp Test 1] custom field value 1 + +[Summary] This paragraph provides more details about foo. + +[Summary] Here some foo specifics are listed. + +[Summary] This section talks about bar. + +[My Corp Test 2] custom field value 2 \ No newline at end of file diff --git a/test/data/doc/misplaced_list_items.norm.out.yaml b/test/data/doc/misplaced_list_items.norm.out.yaml index 1b33dd76..9fd7edff 100644 --- a/test/data/doc/misplaced_list_items.norm.out.yaml +++ b/test/data/doc/misplaced_list_items.norm.out.yaml @@ -81,4 +81,4 @@ texts: prov: [] self_ref: '#/texts/3' text: there -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/misplaced_list_items.out.yaml b/test/data/doc/misplaced_list_items.out.yaml index c334227e..18b5f978 100644 --- a/test/data/doc/misplaced_list_items.out.yaml +++ b/test/data/doc/misplaced_list_items.out.yaml @@ -81,4 +81,4 @@ texts: prov: [] self_ref: '#/texts/3' text: foo -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/page_with_pic.dt.json b/test/data/doc/page_with_pic.dt.json index b11d817e..966d2e02 100644 --- a/test/data/doc/page_with_pic.dt.json +++ b/test/data/doc/page_with_pic.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/page_with_pic_from_files.dt.json b/test/data/doc/page_with_pic_from_files.dt.json index 
b11d817e..966d2e02 100644 --- a/test/data/doc/page_with_pic_from_files.dt.json +++ b/test/data/doc/page_with_pic_from_files.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/page_without_pic.dt.json b/test/data/doc/page_without_pic.dt.json index 10cd83b9..11a6aede 100644 --- a/test/data/doc/page_without_pic.dt.json +++ b/test/data/doc/page_without_pic.dt.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "Document", "furniture": { "self_ref": "#/furniture", diff --git a/test/data/doc/rich_table.out.yaml b/test/data/doc/rich_table.out.yaml index c5f8eecc..76d76a38 100644 --- a/test/data/doc/rich_table.out.yaml +++ b/test/data/doc/rich_table.out.yaml @@ -499,4 +499,4 @@ texts: prov: [] self_ref: '#/texts/5' text: More text in the group. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_1.out.yaml b/test/data/doc/rich_table_item_ins_norm_1.out.yaml index fecd739d..bfd0788d 100644 --- a/test/data/doc/rich_table_item_ins_norm_1.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_1.out.yaml @@ -240,4 +240,4 @@ texts: prov: [] self_ref: '#/texts/1' text: text in italic -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_2.out.yaml b/test/data/doc/rich_table_item_ins_norm_2.out.yaml index f2e05e5b..8cdfc00d 100644 --- a/test/data/doc/rich_table_item_ins_norm_2.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_2.out.yaml @@ -250,4 +250,4 @@ texts: prov: [] self_ref: '#/texts/2' text: text before -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_item_ins_norm_3.out.yaml b/test/data/doc/rich_table_item_ins_norm_3.out.yaml index b35564ff..71f3e2a0 100644 --- a/test/data/doc/rich_table_item_ins_norm_3.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_3.out.yaml @@ -250,4 +250,4 @@ 
texts: prov: [] self_ref: '#/texts/2' text: text in italic -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/doc/rich_table_post_text_del.out.yaml b/test/data/doc/rich_table_post_text_del.out.yaml index 67e086d5..42d71415 100644 --- a/test/data/doc/rich_table_post_text_del.out.yaml +++ b/test/data/doc/rich_table_post_text_del.out.yaml @@ -489,4 +489,4 @@ texts: prov: [] self_ref: '#/texts/4' text: More text in the group. -version: 1.7.0 +version: 1.8.0 diff --git a/test/data/docling_document/unit/CodeItem.yaml b/test/data/docling_document/unit/CodeItem.yaml index 09995640..c263a4f4 100644 --- a/test/data/docling_document/unit/CodeItem.yaml +++ b/test/data/docling_document/unit/CodeItem.yaml @@ -13,3 +13,4 @@ self_ref: '#' text: print(Hello World!) formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/FloatingItem.yaml b/test/data/docling_document/unit/FloatingItem.yaml index 21beef40..0c11c8f3 100644 --- a/test/data/docling_document/unit/FloatingItem.yaml +++ b/test/data/docling_document/unit/FloatingItem.yaml @@ -7,4 +7,5 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +content_layer: body +meta: null diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..f296d801 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -25,6 +25,7 @@ graph: target_cell_id: 0 image: null label: form +meta: null parent: null prov: [] references: [] diff --git a/test/data/docling_document/unit/FormulaItem.yaml b/test/data/docling_document/unit/FormulaItem.yaml index 25057908..cd631ff5 100644 --- a/test/data/docling_document/unit/FormulaItem.yaml +++ b/test/data/docling_document/unit/FormulaItem.yaml @@ -8,3 +8,4 @@ text: E=mc^2 content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/KeyValueItem.yaml 
b/test/data/docling_document/unit/KeyValueItem.yaml index 219e951e..f6db93e1 100644 --- a/test/data/docling_document/unit/KeyValueItem.yaml +++ b/test/data/docling_document/unit/KeyValueItem.yaml @@ -25,6 +25,7 @@ graph: target_cell_id: 0 image: null label: key_value_region +meta: null parent: null prov: [] references: [] diff --git a/test/data/docling_document/unit/ListItem.yaml b/test/data/docling_document/unit/ListItem.yaml index 20d8de90..300661d3 100644 --- a/test/data/docling_document/unit/ListItem.yaml +++ b/test/data/docling_document/unit/ListItem.yaml @@ -10,3 +10,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/PictureItem.yaml b/test/data/docling_document/unit/PictureItem.yaml index ffe342a6..3fc72158 100644 --- a/test/data/docling_document/unit/PictureItem.yaml +++ b/test/data/docling_document/unit/PictureItem.yaml @@ -8,4 +8,5 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +content_layer: body +meta: null diff --git a/test/data/docling_document/unit/SectionHeaderItem.yaml b/test/data/docling_document/unit/SectionHeaderItem.yaml index 68f641f9..1ab1a526 100644 --- a/test/data/docling_document/unit/SectionHeaderItem.yaml +++ b/test/data/docling_document/unit/SectionHeaderItem.yaml @@ -9,3 +9,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml index ae08e00e..b93a89bd 100644 --- a/test/data/docling_document/unit/TableItem.yaml +++ b/test/data/docling_document/unit/TableItem.yaml @@ -194,3 +194,4 @@ references: [] self_ref: '#' content_layer: body annotations: [] +meta: null diff --git a/test/data/docling_document/unit/TextItem.yaml b/test/data/docling_document/unit/TextItem.yaml index 1f72637a..7061046a 100644 --- a/test/data/docling_document/unit/TextItem.yaml +++ 
b/test/data/docling_document/unit/TextItem.yaml @@ -8,3 +8,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/docling_document/unit/TitleItem.yaml b/test/data/docling_document/unit/TitleItem.yaml index 8e2a3dea..7fcbb4cc 100644 --- a/test/data/docling_document/unit/TitleItem.yaml +++ b/test/data/docling_document/unit/TitleItem.yaml @@ -8,3 +8,4 @@ text: whatever content_layer: body formatting: null hyperlink: null +meta: null diff --git a/test/data/legacy_doc/doc-export.docling.yaml.gt b/test/data/legacy_doc/doc-export.docling.yaml.gt index 4fc4a7fa..3dfac982 100644 --- a/test/data/legacy_doc/doc-export.docling.yaml.gt +++ b/test/data/legacy_doc/doc-export.docling.yaml.gt @@ -6822,4 +6822,4 @@ texts: text: '23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)' -version: 1.7.0 +version: 1.8.0 diff --git a/test/test_metadata.py b/test/test_metadata.py new file mode 100644 index 00000000..2eec3e8d --- /dev/null +++ b/test/test_metadata.py @@ -0,0 +1,301 @@ +from pathlib import Path +from typing import Any, Optional + +import pytest +from pydantic import BaseModel +from typing_extensions import override + +from docling_core.transforms.serializer.base import SerializationResult +from docling_core.transforms.serializer.common import create_ser_result +from docling_core.transforms.serializer.markdown import ( + MarkdownDocSerializer, + MarkdownMetaSerializer, + MarkdownParams, +) +from docling_core.types.doc import ( + BaseMeta, + DocItem, + DocItemLabel, + DoclingDocument, + GroupLabel, + MetaFieldName, + MetaUtils, + NodeItem, + RefItem, + SummaryMetaField, +) + +from .test_data_gen_flag import GEN_TEST_DATA + + +class CustomCoordinates(BaseModel): + longitude: float + latitude: float + + +def test_metadata_usage(): + src = 
Path("test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(filename=src) + example_item: NodeItem = RefItem(cref="#/texts/2").resolve(doc=doc) + assert example_item.meta is not None + + # add a custom metadata object to the item + value = CustomCoordinates(longitude=47.3769, latitude=8.5417) + target_name = example_item.meta.set_custom_field( + namespace="my_corp", name="coords", value=value + ) + assert target_name == "my_corp__coords" + + # save the document + exp_file = src.parent / f"{src.stem}_modified.yaml" + if GEN_TEST_DATA: + doc.save_as_yaml(filename=exp_file) + else: + expected = DoclingDocument.load_from_yaml(filename=exp_file) + assert doc.model_dump(mode="json") == expected.model_dump(mode="json") + + # load back the document and read the custom metadata object + loaded_doc = DoclingDocument.load_from_yaml(filename=exp_file) + loaded_item: NodeItem = RefItem(cref="#/texts/2").resolve(doc=loaded_doc) + assert loaded_item.meta is not None + + loaded_dict = loaded_item.meta.get_custom_part()[target_name] + loaded_value = CustomCoordinates.model_validate(loaded_dict) + + # ensure the value is the same + assert loaded_value == value + + +def test_namespace_absence_raises(): + src = Path("test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(filename=src) + example_item = RefItem(cref="#/texts/2").resolve(doc=doc) + + with pytest.raises(ValueError): + example_item.meta.my_corp_programmaticaly_added_field = True + + +def _create_doc_with_group_with_metadata() -> DoclingDocument: + doc = DoclingDocument(name="") + doc.body.meta = BaseMeta( + summary=SummaryMetaField(text="This document talks about various topics.") + ) + grp1 = doc.add_group(name="1", label=GroupLabel.CHAPTER) + grp1.meta = BaseMeta( + summary=SummaryMetaField(text="This chapter discusses foo and bar.") + ) + doc.add_text( + text="This is some introductory text.", label=DocItemLabel.TEXT, parent=grp1 + ) + + grp1a = 
doc.add_group(parent=grp1, name="1a", label=GroupLabel.SECTION) + grp1a.meta = BaseMeta( + summary=SummaryMetaField(text="This section talks about foo.") + ) + grp1a.meta.set_custom_field( + namespace="my_corp", name="test_1", value="custom field value 1" + ) + txt1 = doc.add_text(text="Regarding foo...", label=DocItemLabel.TEXT, parent=grp1a) + txt1.meta = BaseMeta( + summary=SummaryMetaField(text="This paragraph provides more details about foo.") + ) + lst1a = doc.add_list_group(parent=grp1a) + lst1a.meta = BaseMeta( + summary=SummaryMetaField(text="Here some foo specifics are listed.") + ) + doc.add_list_item(text="lorem", parent=lst1a, enumerated=True) + doc.add_list_item(text="ipsum", parent=lst1a, enumerated=True) + + grp1b = doc.add_group(parent=grp1, name="1b", label=GroupLabel.SECTION) + grp1b.meta = BaseMeta( + summary=SummaryMetaField(text="This section talks about bar.") + ) + grp1b.meta.set_custom_field( + namespace="my_corp", name="test_2", value="custom field value 2" + ) + doc.add_text(text="Regarding bar...", label=DocItemLabel.TEXT, parent=grp1b) + + return doc + + +def test_ser_deser(): + doc = _create_doc_with_group_with_metadata() + + # test dumping to and loading from YAML + exp_file = Path("test/data/doc/group_with_metadata.yaml") + if GEN_TEST_DATA: + doc.save_as_yaml(filename=exp_file) + else: + expected = DoclingDocument.load_from_yaml(filename=exp_file) + assert doc == expected + + +def test_md_ser_default(): + doc = _create_doc_with_group_with_metadata() + + # test exporting to Markdown + params = MarkdownParams() + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_default.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_marked(): + doc = 
_create_doc_with_group_with_metadata() + + # test exporting to Markdown + params = MarkdownParams( + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_marked.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_allowed_meta_names(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + allowed_meta_names={ + MetaUtils.create_meta_field_name(namespace="my_corp", name="test_1"), + }, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_allowed_meta_names.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_blocked_meta_names(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + blocked_meta_names={ + MetaUtils.create_meta_field_name(namespace="my_corp", name="test_1"), + MetaFieldName.SUMMARY.value, + }, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_blocked_meta_names.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_md_ser_without_non_meta(): + doc = _create_doc_with_group_with_metadata() + params = MarkdownParams( + include_non_meta=False, + mark_meta=True, + ) + ser = MarkdownDocSerializer(doc=doc, params=params) + ser_res = 
ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_without_non_meta.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + assert actual == expected + + +def test_ser_custom_meta_serializer(): + + class SummaryMarkdownMetaSerializer(MarkdownMetaSerializer): + + @override + def serialize( + self, + *, + item: NodeItem, + doc: DoclingDocument, + level: Optional[int] = None, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's meta.""" + params = MarkdownParams(**kwargs) + return create_ser_result( + text="\n\n".join( + [ + f"{' ' * (level or 0)}[{item.self_ref}] [{item.__class__.__name__}:{item.label.value}] {tmp}" # type:ignore[attr-defined] + for key in ( + list(item.meta.__class__.model_fields) + + list(item.meta.get_custom_part()) + ) + if ( + tmp := self._serialize_meta_field( + item.meta, key, params.mark_meta + ) + ) + ] + if item.meta + else [] + ), + span_source=item if isinstance(item, DocItem) else [], + ) + + def _serialize_meta_field( + self, meta: BaseMeta, name: str, mark_meta: bool + ) -> Optional[str]: + if (field_val := getattr(meta, name)) is not None and isinstance( + field_val, SummaryMetaField + ): + txt = field_val.text + return ( + f"[{self._humanize_text(name, title=True)}] {txt}" + if mark_meta + else txt + ) + else: + return None + + doc = _create_doc_with_group_with_metadata() + + # test exporting to Markdown + params = MarkdownParams( + include_non_meta=False, + ) + ser = MarkdownDocSerializer( + doc=doc, params=params, meta_serializer=SummaryMarkdownMetaSerializer() + ) + ser_res = ser.serialize() + actual = ser_res.text + exp_file = Path("test/data/doc/group_with_metadata_summaries.md") + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(actual) + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read() + 
assert actual == expected diff --git a/test/test_serialization.py b/test/test_serialization.py index a8ce96e4..a8ebcdaa 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -263,7 +263,7 @@ def test_md_list_item_markers(): ) -def test_md_include_annotations_false(): +def test_md_legacy_include_annotations_false(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -271,6 +271,7 @@ def test_md_include_annotations_false(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + use_legacy_annotations=True, include_annotations=False, pages={1, 5}, ), @@ -282,7 +283,7 @@ def test_md_include_annotations_false(): ) -def test_md_mark_annotations_false(): +def test_md_legacy_mark_annotations_false(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -290,6 +291,7 @@ def test_md_mark_annotations_false(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + use_legacy_annotations=True, include_annotations=True, mark_annotations=False, pages={1, 5}, @@ -302,7 +304,7 @@ def test_md_mark_annotations_false(): ) -def test_md_mark_annotations_true(): +def test_md_mark_meta_true(): src = Path("./test/data/doc/2408.09869v3_enriched.json") doc = DoclingDocument.load_from_json(src) @@ -310,6 +312,26 @@ def test_md_mark_annotations_true(): doc=doc, table_serializer=CustomAnnotationTableSerializer(), params=MarkdownParams( + mark_meta=True, + pages={1, 5}, + ), + ) + actual = ser.serialize().text + verify( + exp_file=src.parent / f"{src.stem}_p1_mark_meta_true.gt.md", + actual=actual, + ) + + +def test_md_legacy_mark_annotations_true(): + src = Path("./test/data/doc/2408.09869v3_enriched.json") + doc = DoclingDocument.load_from_json(src) + + ser = MarkdownDocSerializer( + doc=doc, + table_serializer=CustomAnnotationTableSerializer(), + params=MarkdownParams( + use_legacy_annotations=True, 
include_annotations=True, mark_annotations=True, pages={1, 5}, @@ -317,7 +339,8 @@ def test_md_mark_annotations_true(): ) actual = ser.serialize().text verify( - exp_file=src.parent / f"{src.stem}_p1_mark_annotations_true.gt.md", + exp_file=src.parent + / f"{src.stem}_p1_use_legacy_annotations_true_mark_annotations_true.gt.md", actual=actual, )