Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docling_core/transforms/serializer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pydantic import AnyUrl, BaseModel

from docling_core.types.doc.document import (
ChartItem,
DocItem,
DoclingDocument,
FloatingItem,
Expand Down Expand Up @@ -82,6 +83,22 @@ def serialize(
...


class BaseChartSerializer(ABC):
    """Base class for chart item serializers."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: ChartItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serializes the passed item.

        Args:
            item: The chart item to serialize.
            doc_serializer: The document serializer that delegates to this
                chart serializer.
            doc: The document being serialized.
            **kwargs: Additional format-specific serialization options.

        Returns:
            The serialization result for the chart item.
        """
        ...


class BasePictureSerializer(ABC):
"""Base class for picture item serializers."""

Expand Down
11 changes: 11 additions & 0 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from docling_core.transforms.serializer.base import (
BaseAnnotationSerializer,
BaseChartSerializer,
BaseDocSerializer,
BaseFallbackSerializer,
BaseFormSerializer,
Expand All @@ -30,6 +31,7 @@
)
from docling_core.types.doc.document import (
DOCUMENT_TOKENS_EXPORT_LABELS,
ChartItem,
ContentLayer,
DescriptionAnnotation,
DocItem,
Expand Down Expand Up @@ -207,6 +209,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):

text_serializer: BaseTextSerializer
table_serializer: BaseTableSerializer
chart_serializer: BaseChartSerializer
picture_serializer: BasePictureSerializer
key_value_serializer: BaseKeyValueSerializer
form_serializer: BaseFormSerializer
Expand Down Expand Up @@ -362,6 +365,14 @@ def serialize(
visited=my_visited,
**my_kwargs,
)
elif isinstance(item, ChartItem):
part = self.chart_serializer.serialize(
item=item,
doc_serializer=self,
doc=self.doc,
visited=my_visited,
**my_kwargs,
)
elif isinstance(item, PictureItem):
part = self.picture_serializer.serialize(
item=item,
Expand Down
20 changes: 20 additions & 0 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from docling_core.transforms.serializer.base import (
BaseAnnotationSerializer,
BaseChartSerializer,
BaseDocSerializer,
BaseFallbackSerializer,
BaseFormSerializer,
Expand All @@ -27,6 +28,7 @@
)
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.document import (
ChartItem,
CodeItem,
DocItem,
DoclingDocument,
Expand Down Expand Up @@ -207,6 +209,23 @@ def serialize(
return create_ser_result(text=text_res, span_source=res_parts)


class DocTagsChartSerializer(BaseChartSerializer):
    """Chart item serializer for the DocTags output format."""

    @override
    def serialize(
        self,
        *,
        item: ChartItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the passed chart item (placeholder).

        No DocTags chart output is produced yet: every argument is ignored and
        the result of calling ``create_ser_result()`` without arguments is
        returned.
        """
        # TODO add actual implementation
        placeholder_res = create_ser_result()
        return placeholder_res


class DocTagsPictureSerializer(BasePictureSerializer):
"""DocTags-specific picture item serializer."""

Expand Down Expand Up @@ -539,6 +558,7 @@ class DocTagsDocSerializer(DocSerializer):

text_serializer: BaseTextSerializer = DocTagsTextSerializer()
table_serializer: BaseTableSerializer = DocTagsTableSerializer()
chart_serializer: BaseChartSerializer = DocTagsChartSerializer()
picture_serializer: BasePictureSerializer = DocTagsPictureSerializer()
key_value_serializer: BaseKeyValueSerializer = DocTagsKeyValueSerializer()
form_serializer: BaseFormSerializer = DocTagsFormSerializer()
Expand Down
20 changes: 20 additions & 0 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from docling_core.transforms.serializer.base import (
BaseAnnotationSerializer,
BaseChartSerializer,
BaseDocSerializer,
BaseFallbackSerializer,
BaseFormSerializer,
Expand All @@ -46,6 +47,7 @@
from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import (
ChartItem,
CodeItem,
ContentLayer,
DescriptionAnnotation,
Expand Down Expand Up @@ -406,6 +408,23 @@ def serialize(
return create_ser_result(text=text_res, span_source=res_parts)


class HTMLChartSerializer(BaseChartSerializer):
    """Chart item serializer for the HTML output format."""

    @override
    def serialize(
        self,
        *,
        item: ChartItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the passed chart item (placeholder).

        No HTML chart output is produced yet: every argument is ignored and
        the result of calling ``create_ser_result()`` without arguments is
        returned.
        """
        # TODO add actual implementation
        placeholder_res = create_ser_result()
        return placeholder_res


class HTMLPictureSerializer(BasePictureSerializer):
"""HTML-specific picture item serializer."""

Expand Down Expand Up @@ -850,6 +869,7 @@ class HTMLDocSerializer(DocSerializer):

text_serializer: BaseTextSerializer = HTMLTextSerializer()
table_serializer: BaseTableSerializer = HTMLTableSerializer()
chart_serializer: BaseChartSerializer = HTMLChartSerializer()
picture_serializer: BasePictureSerializer = HTMLPictureSerializer()
key_value_serializer: BaseKeyValueSerializer = HTMLKeyValueSerializer()
form_serializer: BaseFormSerializer = HTMLFormSerializer()
Expand Down
59 changes: 59 additions & 0 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from docling_core.transforms.serializer.base import (
BaseAnnotationSerializer,
BaseChartSerializer,
BaseDocSerializer,
BaseFallbackSerializer,
BaseFormSerializer,
Expand All @@ -36,6 +37,7 @@
)
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import (
ChartItem,
CodeItem,
ContentLayer,
DescriptionAnnotation,
Expand Down Expand Up @@ -357,6 +359,62 @@ def serialize(
return create_ser_result(text=text_res, span_source=res_parts)


class MarkdownChartSerializer(BaseChartSerializer):
    """Markdown-specific chart item serializer."""

    @override
    def serialize(
        self,
        *,
        item: ChartItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serializes the passed item.

        The output is assembled from the following parts, in order, separated
        by blank lines:

        1. the item's captions, if they serialize to non-empty text;
        2. unless the item is among the excluded refs:
           the annotations (when ``include_annotations`` is set), the chart
           title, the chart kind, a ``Categories: ...`` line for categorical
           charts, and one ``- name: data`` bullet per series.

        Args:
            item: The chart item to serialize.
            doc_serializer: The document serializer used for captions,
                annotations and the excluded-refs lookup.
            doc: The document being serialized (not used here).
            **kwargs: Options used to build ``MarkdownParams`` and forwarded
                to the caption and annotation serialization.

        Returns:
            A serialization result whose text joins all collected parts.
        """
        params = MarkdownParams(**kwargs)
        res_parts: list[SerializationResult] = []
        cap_res = doc_serializer.serialize_captions(
            item=item,
            **kwargs,
        )

        if cap_res.text:
            res_parts.append(cap_res)

        # NOTE(review): get_excluded_refs() is called without **kwargs, unlike
        # the caption/annotation calls in this method -- confirm that
        # caller-supplied options are not meant to affect the excluded refs.
        if item.self_ref not in doc_serializer.get_excluded_refs():
            if params.include_annotations:
                ann_res = doc_serializer.serialize_annotations(
                    item=item,
                    **kwargs,
                )
                if ann_res.text:
                    res_parts.append(ann_res)

            # Skip empty title/kind so that no blank paragraphs are emitted,
            # mirroring the non-empty checks applied to the parts above.
            if item.data.title:
                res_parts.append(
                    create_ser_result(text=item.data.title, span_source=item)
                )
            if item.data.kind:
                res_parts.append(
                    create_ser_result(text=item.data.kind, span_source=item)
                )

            if item.data.is_categorical and item.data.categories:
                categories = ", ".join(item.data.categories)
                if categories:
                    res_parts.append(
                        create_ser_result(
                            text=f"Categories: {categories}", span_source=item
                        )
                    )

            if item.data.series:
                # Assumes iterating `series` yields (name, data) pairs; if it
                # is a mapping this would need `.items()` -- TODO confirm.
                series_text = "\n".join(
                    f"- {series_name}: {series_data}"
                    for series_name, series_data in item.data.series
                )
                if series_text:
                    res_parts.append(
                        create_ser_result(text=series_text.strip(), span_source=item)
                    )

        text_res = "\n\n".join([r.text for r in res_parts])

        return create_ser_result(text=text_res, span_source=res_parts)


class MarkdownPictureSerializer(BasePictureSerializer):
"""Markdown-specific picture item serializer."""

Expand Down Expand Up @@ -621,6 +679,7 @@ class MarkdownDocSerializer(DocSerializer):

text_serializer: BaseTextSerializer = MarkdownTextSerializer()
table_serializer: BaseTableSerializer = MarkdownTableSerializer()
chart_serializer: BaseChartSerializer = MarkdownChartSerializer()
picture_serializer: BasePictureSerializer = MarkdownPictureSerializer()
key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer()
form_serializer: BaseFormSerializer = MarkdownFormSerializer()
Expand Down
2 changes: 2 additions & 0 deletions docling_core/types/doc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
AnyTableCell,
BaseAnnotation,
ChartBar,
ChartData,
ChartItem,
ChartLine,
ChartPoint,
ChartSlice,
Expand Down
Loading
Loading