From 26ba5a6e7cc2ae843ee5be3b85f66e9bb14ff5e1 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 24 Sep 2025 08:52:35 +0200
Subject: [PATCH] added the Azure Document Intelligence serializer

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 docling_core/transforms/serializer/azure.py   | 480 ++++++++++++++++++
 .../transforms/serializer/markdown.py         |   5 -
 test/test_azure_serializer.py                 |  91 ++++
 3 files changed, 571 insertions(+), 5 deletions(-)
 create mode 100644 docling_core/transforms/serializer/azure.py
 create mode 100644 test/test_azure_serializer.py

diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
new file mode 100644
index 00000000..1844ac67
--- /dev/null
+++ b/docling_core/transforms/serializer/azure.py
@@ -0,0 +1,480 @@
+"""Define classes for Azure serialization.
+
+This serializer exports a DoclingDocument to a JSON structure that mirrors
+the Azure Document Intelligence layout output used in
+`azure_document_intelligence.convert_azure_output_to_docling`.
+
+It traverses the document similarly to the HTML/Markdown serializers but
+accumulates structured JSON for:
+- pages (number, width, height; words omitted by default)
+- tables (with bounding regions and cells)
+- figures (with bounding regions and optional footnotes)
+- paragraphs (with optional Azure roles)
+
+Notes:
+- Word-level segmentation is not available in the DoclingDocument, so the
+  exported `pages[*].words` array is left empty.
+- Bounding boxes are normalized to TOPLEFT origin when page size is known.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.transforms.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    create_ser_result,
+)
+from docling_core.types.doc.base import CoordOrigin
+from docling_core.types.doc.document import (
+    DocItem,
+    DoclingDocument,
+    FloatingItem,
+    FormItem,
+    GroupItem,
+    InlineGroup,
+    KeyValueItem,
+    ListGroup,
+    ListItem,
+    NodeItem,
+    PictureItem,
+    RefItem,
+    RichTableCell,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.labels import DocItemLabel
+
+
+def _bbox_to_polygon_coords(
+    *,
+    l: float,
+    t: float,
+    r: float,
+    b: float,
+) -> list[float]:
+    """Flatten a bbox into the 4-corner polygon [x1, y1, x2, y2, x3, y3, x4, y4]."""
+    corners = ((l, t), (r, t), (r, b), (l, b))  # TL, TR, BR, BL
+    return [value for corner in corners for value in corner]
+
+
+def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]:
+    """Build a polygon from the bbox of the item's first provenance entry.
+
+    Returns None when the item has no provenance or that entry has no bbox.
+    The bbox is converted to TOPLEFT origin only if the page is present in
+    ``doc.pages`` with a known size; otherwise it is used unchanged.
+    """
+    first = item.prov[0] if item.prov else None
+    if first is None or first.bbox is None:
+        return None
+
+    box = first.bbox
+    page = doc.pages.get(first.page_no)
+    needs_flip = box.coord_origin != CoordOrigin.TOPLEFT
+    if needs_flip and page is not None and page.size is not None:
+        box = box.to_top_left_origin(page_height=page.size.height)
+
+    # Emit the corners of the (possibly converted) box
+    return _bbox_to_polygon_coords(l=box.l, t=box.t, r=box.r, b=box.b)
+
+
+class AzureParams(CommonParams):
+    """Serialization parameters for the Azure exporter, extending CommonParams.
+
+    - include_words: reserved flag for exporting page words; defaults to False.
+    """
+
+    include_words: bool = False  # not read anywhere in this module yet
+
+
+class _AzureTextSerializer(BaseModel, BaseTextSerializer):
+    """Text serializer that records paragraphs in the Azure accumulator."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        is_inline_scope: bool = False,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Append ``item`` to ``doc_serializer.azure["paragraphs"]``.
+
+        Every TextItem (list items included) is treated as a paragraph. An
+        entry is recorded only when the text is non-empty and a polygon can
+        be derived from the item's first provenance entry.
+
+        Args:
+            item: Text item to export.
+            doc_serializer: Must be an AzureDocSerializer; owns the accumulator.
+            doc: Document used to resolve page sizes for the polygon.
+            is_inline_scope: Unused.
+            **kwargs: Unused.
+
+        Returns:
+            An empty result; the output lives in the accumulator.
+        """
+        assert isinstance(doc_serializer, AzureDocSerializer)
+
+        # Without provenance there is no polygon, hence nothing to record.
+        outline = _bbox_to_polygon_for_item(doc, item) if item.prov else None
+        if item.text == "" or outline is None:
+            return create_ser_result()
+
+        # Labels that carry an Azure role; any other label stays role-less.
+        role_by_label = (
+            (DocItemLabel.TITLE, "title"),
+            (DocItemLabel.SECTION_HEADER, "sectionHeading"),
+            (DocItemLabel.FOOTNOTE, "footnote"),
+            (DocItemLabel.PAGE_HEADER, "pageHeader"),
+            (DocItemLabel.PAGE_FOOTER, "pageFooter"),
+        )
+        role = next((name for lbl, name in role_by_label if item.label == lbl), None)
+
+        region = {
+            "pageNumber": item.prov[0].page_no,
+            "polygon": outline,
+        }
+        entry: Dict[str, Any] = {"content": item.text, "boundingRegions": [region]}
+        if role is not None:
+            entry["role"] = role
+        doc_serializer.azure.setdefault("paragraphs", []).append(entry)
+
+        # Nothing to emit as text; the accumulator holds the output.
+        return create_ser_result()
+
+
+class _AzureTableSerializer(BaseTableSerializer):
+    """Table serializer that records tables and their cells for Azure output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Append ``item`` to ``doc_serializer.azure["tables"]``.
+
+        Tables without provenance (or without a derivable polygon) are skipped.
+        Each grid cell is exported once, at its anchor (start row/column).
+
+        Args:
+            item: Table to export.
+            doc_serializer: Must be an AzureDocSerializer; owns the accumulator.
+            doc: Document used to resolve page sizes and rich-cell text.
+
+        Returns:
+            An empty result; the output lives in the accumulator.
+        """
+        assert isinstance(doc_serializer, AzureDocSerializer)
+
+        table_poly = _bbox_to_polygon_for_item(doc, item) if item.prov else None
+        if table_poly is None:
+            return create_ser_result()
+        page_no = item.prov[0].page_no
+
+        # Page height for cell bboxes; None when the page or its size is unknown.
+        size = doc.pages[page_no].size if page_no in doc.pages else None
+        page_h = None if size is None else size.height
+
+        cells: list[Dict[str, Any]] = []
+        for row_idx, row in enumerate(item.data.grid):
+            for col_idx, cell in enumerate(row):
+                anchor = (cell.start_row_offset_idx, cell.start_col_offset_idx)
+                if (row_idx, col_idx) != anchor:
+                    continue  # spanned position; exported at its anchor only
+
+                # Rich cells resolve their text through the serializer.
+                text = (
+                    cell._get_text(doc=doc, doc_serializer=doc_serializer)
+                    if isinstance(cell, RichTableCell)
+                    else cell.text
+                )
+
+                # NOTE(review): Azure's REST schema names the span "columnSpan";
+                # confirm that "colSpan" is the key the consumer reads.
+                entry: Dict[str, Any] = {
+                    "content": text.strip(),
+                    "rowIndex": cell.start_row_offset_idx,
+                    "columnIndex": cell.start_col_offset_idx,
+                    "rowSpan": max(cell.row_span, 1),
+                    "colSpan": max(cell.col_span, 1),
+                }
+                if cell.column_header:
+                    entry["kind"] = "columnHeader"
+                elif cell.row_header:
+                    entry["kind"] = "rowHeader"
+
+                box = cell.bbox
+                if box is not None:
+                    # Cell polygons use TOPLEFT origin when the height is known.
+                    if page_h is not None and box.coord_origin != CoordOrigin.TOPLEFT:
+                        box = box.to_top_left_origin(page_height=page_h)
+                    entry["boundingRegions"] = [
+                        {
+                            "pageNumber": page_no,
+                            "polygon": _bbox_to_polygon_coords(
+                                l=box.l, t=box.t, r=box.r, b=box.b
+                            ),
+                        }
+                    ]
+                cells.append(entry)
+
+        table_obj: Dict[str, Any] = {
+            "rowCount": item.data.num_rows,
+            "columnCount": item.data.num_cols,
+            "boundingRegions": [{"pageNumber": page_no, "polygon": table_poly}],
+            "cells": cells,
+        }
+        doc_serializer.azure.setdefault("tables", []).append(table_obj)
+        return create_ser_result()
+
+
+class _AzurePictureSerializer(BasePictureSerializer):
+    """Picture serializer that records figures for Azure output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Append ``item`` to ``doc_serializer.azure["figures"]``.
+
+        Pictures without provenance (or without a derivable polygon) are
+        skipped. A "footnotes" list is attached only when at least one
+        referenced TextItem has provenance and a polygon.
+
+        Args:
+            item: Picture to export.
+            doc_serializer: Must be an AzureDocSerializer; owns the accumulator.
+            doc: Document used to resolve footnote references and page sizes.
+
+        Returns:
+            An empty result; the output lives in the accumulator.
+        """
+        assert isinstance(doc_serializer, AzureDocSerializer)
+
+        fig_poly = _bbox_to_polygon_for_item(doc, item) if item.prov else None
+        if fig_poly is None:
+            return create_ser_result()
+
+        def _region(page_number: int, polygon: list[float]) -> Dict[str, Any]:
+            return {"pageNumber": page_number, "polygon": polygon}
+
+        # Keep only footnotes that resolve to a TextItem with a usable polygon.
+        footnotes: list[Dict[str, Any]] = []
+        for ref in item.footnotes:
+            if not isinstance(ref, RefItem):
+                continue
+            target = ref.resolve(doc)
+            if not (isinstance(target, TextItem) and target.prov):
+                continue
+            note_poly = _bbox_to_polygon_for_item(doc, target)
+            if note_poly is None:
+                continue
+            region = _region(target.prov[0].page_no, note_poly)
+            footnotes.append({"content": target.text, "boundingRegions": [region]})
+
+        figure: Dict[str, Any] = {
+            "boundingRegions": [_region(item.prov[0].page_no, fig_poly)],
+        }
+        if footnotes:
+            figure["footnotes"] = footnotes
+
+        doc_serializer.azure.setdefault("figures", []).append(figure)
+        return create_ser_result()
+
+
+class _AzureKeyValueSerializer(BaseKeyValueSerializer):
+    """Key-value serializer that contributes nothing to the Azure output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Return an empty result; key-value regions are not exported."""
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class _AzureFormSerializer(BaseFormSerializer):
+    """Form serializer that contributes nothing to the Azure output."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # form regions are not exported
+        return create_ser_result()
+
+
+class _AzureListSerializer(BaseModel, BaseListSerializer):
+    """List serializer with no output of its own; list items are exported as text."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: ListGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        list_level: int = 0,
+        is_inline_scope: bool = False,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Return an empty result without descending into the list's children."""
+        # Children are expected to be reached by DocSerializer.get_parts.
+        del item, doc_serializer, doc, list_level, is_inline_scope, kwargs
+        return create_ser_result()
+
+
+class _AzureInlineSerializer(BaseInlineSerializer):
+    """Inline-group serializer with no output of its own."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        list_level: int = 0,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, list_level, kwargs  # group emits nothing
+        return create_ser_result()
+
+
+class _AzureFallbackSerializer(BaseFallbackSerializer):
+    """Fallback serializer; emits nothing itself."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Return an empty result without recursing into ``item``."""
+        del item, doc_serializer, doc, kwargs  # intentionally unused
+        return create_ser_result()
+
+
+class AzureDocSerializer(DocSerializer):
+    """Azure-specific document serializer.
+
+    The sub-serializers fill ``azure`` while the document is traversed;
+    ``serialize_doc`` then emits it as an Azure-layout-style JSON string.
+    """
+
+    text_serializer: BaseTextSerializer = _AzureTextSerializer()
+    table_serializer: BaseTableSerializer = _AzureTableSerializer()
+    picture_serializer: BasePictureSerializer = _AzurePictureSerializer()
+    key_value_serializer: BaseKeyValueSerializer = _AzureKeyValueSerializer()
+    form_serializer: BaseFormSerializer = _AzureFormSerializer()
+    fallback_serializer: BaseFallbackSerializer = _AzureFallbackSerializer()
+
+    list_serializer: BaseListSerializer = _AzureListSerializer()
+    inline_serializer: BaseInlineSerializer = _AzureInlineSerializer()
+
+    params: AzureParams = AzureParams()
+
+    # Accumulator for the Azure-like output, filled by the sub-serializers
+    azure: Dict[str, Any] = Field(default_factory=dict)
+
+    @override
+    def serialize_doc(
+        self,
+        *,
+        parts: list[SerializationResult],  # only used as span source
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Dump the accumulated Azure structure as a single-line JSON string."""
+        # Sub-serializers only create the keys they append to, so make sure
+        # every top-level key exists (in a stable order) before using them.
+        keys = ("pages", "tables", "figures", "paragraphs")
+        self.azure = {**{key: [] for key in keys}, **self.azure}
+
+        # Rebuild pages (ordered by number) so repeated calls do not duplicate
+        # them; pages without a size are left out and words are never exported.
+        self.azure["pages"] = [
+            {
+                "pageNumber": page_no,
+                "width": page.size.width,
+                "height": page.size.height,
+                "words": [],
+            }
+            for page_no, page in sorted(self.doc.pages.items())
+            if page.size is not None
+        ]
+
+        json_text = json.dumps(self.azure, ensure_ascii=False)
+        return create_ser_result(text=json_text, span_source=parts)
+
+    # Formatting/hyperlink hooks are no-ops for JSON output
+    @override
+    def serialize_bold(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_italic(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_subscript(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_superscript(self, text: str, **kwargs: Any) -> str:
+        return text
+
+    @override
+    def serialize_hyperlink(self, text: str, hyperlink: Any, **kwargs: Any) -> str:
+        return text
diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
index 3598c135..ee1d7d6d 100644
--- a/docling_core/transforms/serializer/markdown.py
+++ b/docling_core/transforms/serializer/markdown.py
@@ -1,8 +1,3 @@
-#
-# Copyright IBM Corp. 2024 - 2025
-# SPDX-License-Identifier: MIT
-#
-
 """Define classes for Markdown serialization."""
 import html
 import re
diff --git a/test/test_azure_serializer.py b/test/test_azure_serializer.py
new file mode 100644
index 00000000..a058e0ca
--- /dev/null
+++ b/test/test_azure_serializer.py
@@ -0,0 +1,91 @@
+"""Tests for AzureDocSerializer."""
+
+import json
+import os
+from pathlib import Path
+
+from docling_core.transforms.serializer.azure import AzureDocSerializer
+from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.document import DocItemLabel, DoclingDocument, ProvenanceItem
+
+from .test_data_gen_flag import GEN_TEST_DATA
+from .test_docling_doc import _construct_doc
+
+
+def _verify_json(exp_file: Path, actual_json: str) -> None:
+    """Check ``actual_json`` against ``exp_file``, generating the file if needed."""
+    if not GEN_TEST_DATA and exp_file.exists():
+        assert exp_file.read_text(encoding="utf-8").rstrip() == actual_json
+        return
+    # (Re)generate the ground truth: GEN_TEST_DATA is set or the file is missing
+    exp_file.write_text(actual_json + "\n", encoding="utf-8")
+
+
+def test_azure_serialize_activities_doc():
+    """Export the activities.json ground-truth document and check the result."""
+    src = Path("./test/data/doc/activities.json")
+    doc = DoclingDocument.load_from_json(src)
+
+    actual_json = AzureDocSerializer(doc=doc).serialize().text
+
+    # The output must be a JSON object holding a list under each top-level key
+    expected_keys = ("pages", "tables", "figures", "paragraphs")
+    data = json.loads(actual_json)
+    assert isinstance(data, dict)
+    for key in expected_keys:
+        assert key in data and isinstance(data[key], list)
+
+    # Compare with (or generate) the stored ground truth
+    gt_file = src.with_suffix(".gt.azure.json")
+    _verify_json(exp_file=gt_file, actual_json=actual_json)
+
+
+def test_azure_serialize_construct_doc_minimal_prov():
+    """Export a constructed document after attaching minimal provenance.
+
+    The test adds one page when ``_construct_doc()`` yields none and gives
+    bounding boxes to a few items that lack provenance, so that the exporter
+    has entries with boundingRegions to emit.
+    """
+    doc = _construct_doc()
+
+    # Ensure at least one page is present
+    if not doc.pages:
+        doc.add_page(page_no=1, size=Size(width=600.0, height=800.0), image=None)
+
+    def _ensure_prov(item, l=10.0, t=10.0, r=200.0, b=40.0):
+        """Give ``item`` a TOPLEFT bbox on the lowest page unless it has prov."""
+        if item.prov:
+            return
+        bbox = BoundingBox(l=l, t=t, r=r, b=b, coord_origin=CoordOrigin.TOPLEFT)
+        first_page = min(doc.pages.keys())
+        prov = ProvenanceItem(page_no=first_page, bbox=bbox, charspan=(0, 0))
+        item.prov = [prov]
+
+    # Title, plain text and section headers among the first three text items
+    wanted = {DocItemLabel.TITLE, DocItemLabel.TEXT, DocItemLabel.SECTION_HEADER}
+    for text_item in doc.texts[:3]:
+        if text_item.label in wanted:
+            _ensure_prov(text_item)
+
+    # First table, when the document has one
+    if doc.tables:
+        _ensure_prov(doc.tables[0], l=20.0, t=80.0, r=300.0, b=200.0)
+
+    # First picture, when the document has one
+    if doc.pictures:
+        _ensure_prov(doc.pictures[0], l=320.0, t=80.0, r=500.0, b=220.0)
+
+    # Serialize through the exporter under test
+    actual_json = AzureDocSerializer(doc=doc).serialize().text
+
+    # Basic structure check
+    data = json.loads(actual_json)
+    assert isinstance(data, dict)
+    assert isinstance(data.get("pages"), list) and len(data["pages"]) >= 1
+    assert isinstance(data.get("paragraphs"), list)
+
+    # Compare with (or generate) the stored ground truth
+    exp_file = Path("./test/data/doc/constructed_doc.gt.azure.json")
+    _verify_json(exp_file=exp_file, actual_json=actual_json)
+