Skip to content

Commit 0abd9ca

Browse files
committed
add deprecation, add first migration
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent e4ae54d commit 0abd9ca

File tree

6 files changed

+280
-175
lines changed

6 files changed

+280
-175
lines changed

docling_core/types/doc/document.py

Lines changed: 76 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
Field,
2828
FieldSerializationInfo,
2929
StringConstraints,
30+
TypeAdapter,
31+
ValidationError,
3032
computed_field,
3133
field_serializer,
3234
field_validator,
@@ -941,39 +943,51 @@ class ContentLayer(str, Enum):
941943
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
942944

943945

944-
class BaseMeta(BaseModel):
945-
"""Base class for metadata."""
946+
class BasePrediction(BaseModel):
947+
"""Prediction field."""
946948

947-
model_config = ConfigDict(extra="allow")
949+
confidence: Optional[float] = None
950+
provenance: Optional[str] = None
951+
details: Optional[dict[str, Any]] = None
948952

953+
@field_serializer("confidence")
def _serialize(
    self, value: Optional[float], info: FieldSerializationInfo
) -> Optional[float]:
    """Serialize `confidence`, delegating rounding to `round_pydantic_float`.

    Args:
        value: The confidence score, or None when no score was recorded.
        info: Serialization info; its `context` is forwarded to the rounding
            helper together with the `CONFID_PREC` context key.

    Returns:
        None when `value` is None, otherwise whatever `round_pydantic_float`
        returns for the value and serialization context.
    """
    # `confidence` is declared Optional; there is nothing to round when unset,
    # and the helper should not be handed a non-float.
    if value is None:
        return None
    return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
949956

950-
class SummaryInstance(BaseModel):
951-
"""Single summary data point."""
957+
958+
class SummaryMetaField(BasePrediction):
959+
"""Summary data."""
952960

953961
text: str
954-
confidence: Optional[float] = None
955-
provenance: Optional[str] = None
956962

957963

958-
class SummaryModel(BaseModel):
959-
"""Summary data."""
964+
class BaseMeta(BaseModel):
965+
"""Base class for metadata."""
966+
967+
model_config = ConfigDict(extra="allow")
968+
summary: Optional[SummaryMetaField] = None
969+
960970

961-
# convention: the first instance represents the main summary
962-
instances: List[SummaryInstance] = Field(default_factory=list, min_length=1)
963-
# NOTE: if needed, can add validator to coerce simpler forms to instances
971+
class PictureClassificationPrediction(BasePrediction):
972+
"""Picture classification instance."""
964973

974+
class_name: str
965975

966-
class CommonMeta(BaseMeta):
967-
"""Common metadata model."""
968976

969-
summary: Optional[SummaryModel] = None
977+
class PictureClassificationMetaField(BaseModel):
978+
"""Picture classification metadata field."""
979+
980+
predictions: list[PictureClassificationPrediction] = Field(
981+
default_factory=list, min_length=1
982+
)
970983

971984

972-
class PictureMeta(CommonMeta):
985+
class PictureMeta(BaseMeta):
973986
"""Picture metadata model."""
974987

988+
classification: Optional[PictureClassificationMetaField] = None
989+
975990
# TODO the previous classes include "kind" for disambiguation, which is not needed here
976-
classification: Optional[PictureClassificationData] = None
977991
molecule: Optional[PictureMoleculeData] = None
978992
tabular_chart: Optional[PictureTabularChartData] = None
979993
line_chart: Optional[PictureLineChartData] = None
@@ -983,13 +997,6 @@ class PictureMeta(CommonMeta):
983997
scatter_chart: Optional[PictureScatterChartData] = None
984998

985999

986-
class TableMeta(CommonMeta):
987-
"""Table metadata model."""
988-
989-
# TODO the previous classes include "kind" for disambiguation, which is not needed here
990-
description: Optional[DescriptionAnnotation] = None
991-
992-
9931000
class NodeItem(BaseModel):
9941001
"""NodeItem."""
9951002

@@ -1099,7 +1106,7 @@ def _add_sibling(
10991106
class GroupItem(NodeItem): # Container type, can't be a leaf node
11001107
"""GroupItem."""
11011108

1102-
meta: Optional[CommonMeta] = None
1109+
meta: Optional[BaseMeta] = None
11031110

11041111
name: str = (
11051112
"group" # Name of the group, e.g. "Introduction Chapter",
@@ -1151,7 +1158,7 @@ class DocItem(
11511158

11521159
label: DocItemLabel
11531160
prov: List[ProvenanceItem] = []
1154-
meta: Optional[CommonMeta] = None
1161+
meta: Optional[BaseMeta] = None
11551162

11561163
def get_location_tokens(
11571164
self,
@@ -1460,9 +1467,47 @@ class PictureItem(FloatingItem):
14601467
DocItemLabel.PICTURE
14611468
)
14621469

1463-
annotations: List[PictureDataType] = []
1470+
annotations: Annotated[
1471+
List[PictureDataType],
1472+
Field(deprecated="The `annotations` field is deprecated; use `meta` instead."),
1473+
] = []
14641474
meta: Optional[PictureMeta] = None
14651475

1476+
@model_validator(mode="before")
@classmethod
def migrate_annotations_to_meta(cls, data: Any) -> Any:
    """Migrate legacy `annotations` entries into the `meta` field.

    Runs before field validation. When the raw input is a dict with a
    non-empty `annotations` value, every entry is validated as a
    `PictureDataType`. The first `PictureClassificationData` entry that has
    at least one predicted class is converted into `meta["classification"]`,
    unless `meta` already carries a classification.

    The input dict is never mutated; a shallow copy is returned when a
    migration takes place.

    Args:
        data: Raw input handed to the model. Only dict input is migrated.

    Returns:
        `data` itself when nothing was migrated, otherwise a shallow copy of
        `data` whose `meta` entry includes the migrated classification.

    Raises:
        ValidationError: If an annotation entry is not a valid
            `PictureDataType`.
    """
    if not isinstance(data, dict):
        return data
    annotations = data.get("annotations")
    if not annotations:
        return data

    # Build the adapter once instead of once per annotation.
    adapter = TypeAdapter(PictureDataType)

    meta = data.get("meta")
    migrated = False
    for raw_ann in annotations:
        # Every entry is validated, so malformed annotations still fail loudly.
        ann = adapter.validate_python(raw_ann)
        if not isinstance(ann, PictureClassificationData):
            continue
        # The meta field requires at least one prediction; an annotation
        # without predicted classes carries nothing to migrate.
        if not ann.predicted_classes:
            continue
        if meta is None:
            # Covers both a missing key and an explicit `"meta": None`.
            meta = {}
        elif not isinstance(meta, dict):
            # `meta` is already a constructed object; leave it untouched.
            continue
        # Existing classification data wins over the legacy annotation.
        if meta.get("classification") is not None:
            continue
        classification = PictureClassificationMetaField(
            predictions=[
                PictureClassificationPrediction(
                    class_name=pred.class_name,
                    confidence=pred.confidence,
                    provenance=ann.provenance,
                )
                for pred in ann.predicted_classes
            ],
        )
        # Copy rather than mutate so the caller's dict stays as passed in.
        meta = {**meta, "classification": classification.model_dump()}
        migrated = True

    if not migrated:
        return data
    return {**data, "meta": meta}
1510+
14661511
# Convert the image to Base64
14671512
def _image_to_base64(self, pil_image, format="PNG"):
14681513
"""Base64 representation of the image."""
@@ -1609,8 +1654,10 @@ class TableItem(FloatingItem):
16091654
DocItemLabel.TABLE,
16101655
] = DocItemLabel.TABLE
16111656

1612-
annotations: List[TableAnnotationType] = []
1613-
meta: Optional[TableMeta] = None
1657+
# Deprecated in favor of `meta`. Declared through `Field(deprecated=...)` so the
# deprecation is expressed the same way as on `PictureItem.annotations`.
annotations: Annotated[
    List[TableAnnotationType],
    Field(deprecated="The `annotations` field is deprecated; use `meta` instead."),
] = []
16141661

16151662
def export_to_dataframe(
16161663
self, doc: Optional["DoclingDocument"] = None

0 commit comments

Comments
 (0)