Skip to content

Commit 33e2f68

Browse files
committed
extend annotations migration
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent 83b1948 commit 33e2f68

File tree

9 files changed

+259
-505
lines changed

9 files changed

+259
-505
lines changed

docling_core/types/doc/document.py

Lines changed: 61 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -943,12 +943,27 @@ class ContentLayer(str, Enum):
943943
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
944944

945945

946-
class BasePrediction(BaseModel):
946+
class ExtraAllowingModel(BaseModel):
947+
"""Base model allowing extra fields."""
948+
949+
model_config = ConfigDict(extra="allow")
950+
951+
def _get_extra_dict(self) -> dict[str, Any]:
952+
"""Get the extra fields as a dictionary."""
953+
return self.__pydantic_extra__ or {}
954+
955+
def _copy_without_extra(self) -> Self:
956+
"""Create a copy without the extra fields."""
957+
return self.model_validate(
958+
self.model_dump(exclude={ex for ex in self._get_extra_dict()})
959+
)
960+
961+
962+
class BasePrediction(ExtraAllowingModel):
947963
"""Prediction field."""
948964

949965
confidence: Optional[float] = None
950966
provenance: Optional[str] = None
951-
details: Optional[dict[str, Any]] = None
952967

953968
@field_serializer("confidence")
954969
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -961,10 +976,9 @@ class SummaryMetaField(BasePrediction):
961976
text: str
962977

963978

964-
class BaseMeta(BaseModel):
979+
class BaseMeta(ExtraAllowingModel):
965980
"""Base class for metadata."""
966981

967-
model_config = ConfigDict(extra="allow")
968982
summary: Optional[SummaryMetaField] = None
969983

970984

@@ -974,7 +988,7 @@ class PictureClassificationPrediction(BasePrediction):
974988
class_name: str
975989

976990

977-
class PictureClassificationMetaField(BaseModel):
991+
class PictureClassificationMetaField(ExtraAllowingModel):
978992
"""Picture classification metadata field."""
979993

980994
predictions: list[PictureClassificationPrediction] = Field(
@@ -1469,7 +1483,7 @@ class PictureItem(FloatingItem):
14691483

14701484
annotations: Annotated[
14711485
List[PictureDataType],
1472-
Field(deprecated="The `annotations` field is deprecated; use `meta` instead."),
1486+
deprecated("Field `annotations` is deprecated; use `meta` instead."),
14731487
] = []
14741488
meta: Optional[PictureMeta] = None
14751489

@@ -1478,34 +1492,55 @@ class PictureItem(FloatingItem):
14781492
def migrate_annotations_to_meta(cls, data: Any) -> Any:
14791493
"""Migrate the `annotations` field to `meta`."""
14801494
if isinstance(data, dict) and (annotations := data.get("annotations")):
1481-
1495+
_logger.warning(
1496+
"Migrating deprecated `annotations` to `meta`; this will be removed in the future. "
1497+
"Note that only the first available instance of each annotation type will be migrated."
1498+
)
14821499
for raw_ann in annotations:
14831500
# migrate annotations to meta
1484-
try:
1485-
# Use Pydantic TypeAdapter to validate the annotation type according to the instruction.
14861501

1502+
try:
14871503
ann: PictureDataType = TypeAdapter(PictureDataType).validate_python(
14881504
raw_ann
14891505
)
1490-
if isinstance(ann, PictureClassificationData):
1491-
# ensure meta field is present
1492-
data.setdefault("meta", {})
1493-
data["meta"].setdefault(
1494-
"classification",
1495-
PictureClassificationMetaField(
1496-
predictions=[
1497-
PictureClassificationPrediction(
1498-
class_name=pred.class_name,
1499-
confidence=pred.confidence,
1500-
provenance=ann.provenance,
1501-
)
1502-
for pred in ann.predicted_classes
1503-
],
1504-
).model_dump(),
1505-
)
15061506
except ValidationError as e:
15071507
raise e
15081508

1509+
# ensure meta field is present
1510+
data.setdefault("meta", {})
1511+
1512+
if isinstance(ann, PictureClassificationData):
1513+
data["meta"].setdefault(
1514+
"classification",
1515+
PictureClassificationMetaField(
1516+
predictions=[
1517+
PictureClassificationPrediction(
1518+
class_name=pred.class_name,
1519+
confidence=pred.confidence,
1520+
provenance=ann.provenance,
1521+
)
1522+
for pred in ann.predicted_classes
1523+
],
1524+
).model_dump(mode="json"),
1525+
)
1526+
# migrate description annotation to summary meta field
1527+
elif isinstance(ann, DescriptionAnnotation):
1528+
data["meta"].setdefault(
1529+
"summary",
1530+
SummaryMetaField(
1531+
text=ann.text,
1532+
provenance=ann.provenance,
1533+
).model_dump(mode="json"),
1534+
)
1535+
# TODO add other relevant annotation types...
1536+
else:
1537+
# fall back to reusing (namespaced) original annotation type name
1538+
data["meta"].setdefault(
1539+
f"docling_internal_{ann.kind}",
1540+
ann.model_dump(mode="json"),
1541+
)
1542+
# TODO: add other annotation types to meta
1543+
15091544
return data
15101545

15111546
# Convert the image to Base64
@@ -1656,7 +1691,7 @@ class TableItem(FloatingItem):
16561691

16571692
annotations: Annotated[
16581693
List[TableAnnotationType],
1659-
deprecated("The `annotations` field is deprecated; use `meta` instead."),
1694+
deprecated("Field `annotations` is deprecated; use `meta` instead."),
16601695
] = []
16611696

16621697
def export_to_dataframe(

docs/DoclingDocument.json

Lines changed: 3 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1417,6 +1417,7 @@
14171417
"type": "object"
14181418
},
14191419
"PictureClassificationMetaField": {
1420+
"additionalProperties": true,
14201421
"description": "Picture classification metadata field.",
14211422
"properties": {
14221423
"predictions": {
@@ -1432,6 +1433,7 @@
14321433
"type": "object"
14331434
},
14341435
"PictureClassificationPrediction": {
1436+
"additionalProperties": true,
14351437
"description": "Picture classification instance.",
14361438
"properties": {
14371439
"confidence": {
@@ -1458,19 +1460,6 @@
14581460
"default": null,
14591461
"title": "Provenance"
14601462
},
1461-
"details": {
1462-
"anyOf": [
1463-
{
1464-
"additionalProperties": true,
1465-
"type": "object"
1466-
},
1467-
{
1468-
"type": "null"
1469-
}
1470-
],
1471-
"default": null,
1472-
"title": "Details"
1473-
},
14741463
"class_name": {
14751464
"title": "Class Name",
14761465
"type": "string"
@@ -2230,6 +2219,7 @@
22302219
"type": "object"
22312220
},
22322221
"SummaryMetaField": {
2222+
"additionalProperties": true,
22332223
"description": "Summary data.",
22342224
"properties": {
22352225
"confidence": {
@@ -2256,19 +2246,6 @@
22562246
"default": null,
22572247
"title": "Provenance"
22582248
},
2259-
"details": {
2260-
"anyOf": [
2261-
{
2262-
"additionalProperties": true,
2263-
"type": "object"
2264-
},
2265-
{
2266-
"type": "null"
2267-
}
2268-
],
2269-
"default": null,
2270-
"title": "Details"
2271-
},
22722249
"text": {
22732250
"title": "Text",
22742251
"type": "string"

0 commit comments

Comments
 (0)