From c2ba4c38370c60da407cb2365bc64696e0cc3631 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sat, 9 Aug 2025 20:35:05 -0400 Subject: [PATCH 01/14] promoted `get()` with default value from `Mmif` to `MmifObject` (prime superclass) --- mmif/serialize/mmif.py | 19 ------------------- mmif/serialize/model.py | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 2715bae8..efec7348 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -794,22 +794,3 @@ def __getitem__(self, item: str) \ return self.views.__getitem__(item) except KeyError: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") - - def get(self, obj_id, default=None): - """ - High-level getter for Mmif. This will try to find any object, given - an identifier or an immediate attribute name. When nothing is found, - this will return a default value instead of raising an error. - - :param obj_id: an immediate attribute name or an object identifier - (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view - ID prefix), the method will try to find a match from - the first view, and return immediately if found. - :param default: the default value to return if none is found - :return: the object searched for or the default value - """ - try: - return self.__getitem__(obj_id) - except KeyError: - return default diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 70ff9e25..9c232a81 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -314,6 +314,25 @@ def __getitem__(self, key) -> Union['MmifObject', str, datetime]: else: return value + def get(self, obj_id, default=None): + """ + High-level getter for Mmif. This will try to find any object, given + an identifier or an immediate attribute name. When nothing is found, + this will return a default value instead of raising an error. + + :param obj_id: an immediate attribute name or an object identifier + (a document ID, a view ID, or an annotation ID). When + annotation ID is given as a "short" ID (without view + ID prefix), the method will try to find a match from + the first view, and return immediately if found. + :param default: the default value to return if none is found + :return: the object searched for or the default value + """ + try: + return self.__getitem__(obj_id) + except KeyError: + return default + class MmifObjectEncoder(json.JSONEncoder): """ From 6f95b04cec7273cf493d3985237b468a75b9ff24 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Sun, 17 Aug 2025 20:33:57 -0400 Subject: [PATCH 02/14] deprecated `get()` from list-like objs, separate key-index, int-index ... * added `get_by_key()` for string queries * `__getitem__()` now only works with integer/slice indexing --- mmif/serialize/mmif.py | 37 +++++++++++++++++++-------------- mmif/serialize/model.py | 35 ++++++++++++++++++++++--------- mmif/serialize/view.py | 4 ++-- tests/test_serialize.py | 46 ++++++++++++++++++----------------------- 4 files changed, 69 insertions(+), 53 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index efec7348..a4c4512a 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -266,8 +266,8 @@ def _when_failed(): ## caching alignments if all(map(lambda x: x in alignment_ann.properties, ('source', 'target'))): try: - source_ann = self[alignment_ann.get('source')] - target_ann = self[alignment_ann.get('target')] + source_ann = self.__getitem__(alignment_ann.get('source')) + target_ann = self.__getitem__(alignment_ann.get('target')) if isinstance(source_ann, Annotation) and isinstance(target_ann, Annotation): source_ann._cache_alignment(alignment_ann, target_ann) target_ann._cache_alignment(alignment_ann, source_ann) @@ -586,7 +586,7 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s aligned_types = set() for ann_id in [alignment['target'], alignment['source']]: ann_id = cast(str, ann_id) - aligned_type = cast(Annotation, self[ann_id]).at_type + aligned_type = cast(Annotation, self.__getitem__(ann_id)).at_type aligned_types.add(aligned_type) aligned_types = list(aligned_types) # because membership check for sets also checks hash() values if len(aligned_types) == 2 and at_type1 in aligned_types and at_type2 in aligned_types: @@ -745,10 +745,10 @@ def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: point = math.inf if start else -1 comp = min if start else max for target_id in ann.get_property('targets'): - point = comp(point, self._get_linear_anchor_point(self[target_id], start=start)) + point = comp(point, self._get_linear_anchor_point(self.__getitem__(target_id), start=start)) return point target_id = ann.get_property('targets')[0 if start else -1] - return self._get_linear_anchor_point(self[target_id], start=start) + return self._get_linear_anchor_point(self.__getitem__(target_id), start=start) elif (start and 'start' in props) or (not start and 'end' in props): return ann.get_property('start' if start else 'end') else: @@ -766,8 +766,7 @@ def get_end(self, annotation: Annotation) -> Union[int, float]: """ return self._get_linear_anchor_point(annotation, start=False) - def __getitem__(self, item: str) \ - -> Union[Document, View, Annotation, MmifMetadata, DocumentsList, ViewsList]: + def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetadata]: """ index ([]) implementation for Mmif. This will try to find any object, given an identifier or an immediate attribute name. When nothing is found, this will raise an error rather than returning a None @@ -780,17 +779,25 @@ def __getitem__(self, item: str) \ :raise KeyError: if the item is not found or multiple objects are found with the same ID """ if item in self._named_attributes(): - return self.__dict__[item] + return self.__dict__.__getitem__(item) if self.id_delimiter in item: vid, _ = item.split(self.id_delimiter, 1) - return self.views[vid].annotations[item] + view = self.views.get_by_key(vid) + if view is None: + raise KeyError(f"View with ID {vid} not found in the MMIF object.") + ann = view.annotations.get_by_key(item) + if ann is None: + raise KeyError(f"Annotation with ID {item} not found in the MMIF object.") + return ann else: # search for document first, then views # raise KeyError if nothing is found - try: - return self.documents.__getitem__(item) - except KeyError: - try: - return self.views.__getitem__(item) - except KeyError: + ret = self.documents.get_by_key(item) + if ret is None: + ret = self.views.get_by_key(item) + if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") + else: + return ret + else: + return ret diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 9c232a81..671ff228 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -13,8 +13,9 @@ """ import json +import warnings from datetime import datetime -from typing import Union, Any, Dict, Optional, TypeVar, Generic, Generator, Iterator, Type, Set, ClassVar +from typing import Union, Any, Dict, Optional, TypeVar, Generic, Generator, Iterator, Type, Set, ClassVar, List from deepdiff import DeepDiff @@ -302,7 +303,7 @@ def __contains__(self, key: str) -> bool: except (TypeError, AttributeError, KeyError): return False - def __getitem__(self, key) -> Union['MmifObject', str, datetime]: + def __getitem__(self, key) -> Any: if key in self._named_attributes(): value = self.__dict__[key] elif self._unnamed_attributes is None: @@ -394,6 +395,11 @@ def _deserialize(self, input_list: list) -> None: def get(self, key: str, default=None) -> Optional[T]: """ + .. deprecated:: 1.1.3 + Will be removed in 2.0.0. + Use `__getitem__` for integer access or `get_by_key` for string-based + ID or attribute access. + Standard dictionary-style get() method, albeit with no ``default`` parameter. Relies on the implementation of __getitem__. @@ -403,10 +409,17 @@ def get(self, key: str, default=None) -> Optional[T]: :param default: the default value to return if the key is not found :return: the value matching that key """ - try: - return self[key] - except KeyError: - return default + warnings.warn( + "The 'get' method is deprecated and will be removed in a future version. " + "Use `__getitem__` for integer access or `get_by_key` for string-based " \ + "ID or attribute access.", + DeprecationWarning, + stacklevel=2 + ) + return self.get_by_key(key, default) + + def get_by_key(self, key: str, default=None) -> Optional[T]: + return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: """ @@ -431,11 +444,13 @@ def _append_with_key(self, key: str, value: T, overwrite=False) -> None: def append(self, value, overwrite): raise NotImplementedError() - def __getitem__(self, key: str) -> T: - if key not in self.reserved_names: - return self._items.__getitem__(key) + def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: + if isinstance(key, (int, slice)): + # Python's dicts preserve insertion order since 3.7. + # We can convert values to a list and index it. + return list(self._items.values())[key] else: - raise KeyError("Don't use __getitem__ to access a reserved name") + raise TypeError(f"list indices must be integers or slices, not {type(key).__name__}") def __setitem__(self, key: str, value: T): if key not in self.reserved_names: diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d8821c5b..82a624dc 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -287,7 +287,7 @@ def get_document_by_id(self, doc_id) -> Document: "View.get_document_by_id() is deprecated, use view[doc_id] instead.", DeprecationWarning ) - doc_found = self.annotations[doc_id] + doc_found = self.annotations.get_by_key(doc_id) if not isinstance(doc_found, Document): raise KeyError(f"Document \"{doc_id}\" not found in view {self.id}.") return cast(Document, doc_found) @@ -302,7 +302,7 @@ def __getitem__(self, key: str) -> 'Annotation': """ if key in self._named_attributes(): return self.__dict__[key] - anno_result = self.annotations.get(key) + anno_result = self.annotations.get_by_key(key) if not anno_result: raise KeyError("Annotation ID not found: %s" % key) return anno_result diff --git a/tests/test_serialize.py b/tests/test_serialize.py index bec77d28..86159185 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -802,14 +802,14 @@ def test_new_textdocument(self): self.assertTrue(td1.properties.text_value == td1.text_value) self.assertNotEqual(td1.text_language, td2.text_language) self.assertEqual(english_text, td1.text_value) - self.assertEqual(td1, self.view_obj.annotations.get(td1.id)) + self.assertEqual(td1, self.view_obj.annotations.get_by_key(td1.id)) td3 = self.view_obj.new_textdocument(english_text, mime='plain/text') self.assertEqual(td1.text_value, td3.text_value) self.assertEqual(len(td1.properties), len(td3.properties) - 1) def test_parent(self): mmif_obj = Mmif(self.mmif_examples_json['everything']) - self.assertTrue(all(anno.parent == v.id for v in mmif_obj.views for anno in mmif_obj.get_view_by_id(v.id).annotations)) + self.assertTrue(all(anno.parent == v.id for v in mmif_obj.views for anno in mmif_obj[v.id].annotations)) def test_non_existing_parent(self): anno_obj = Annotation(FRACTIONAL_EXAMPLES['doc_only']) @@ -820,20 +820,20 @@ def test_non_existing_parent(self): def test_get_by_id(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) - mmif_obj['m1'] - mmif_obj['v4:td1'] + mmif_obj.__getitem__('m1') + mmif_obj.__getitem__('v4:td1') with self.assertRaises(KeyError): - mmif_obj['m55'] + mmif_obj.__getitem__('m55') with self.assertRaises(KeyError): - mmif_obj['v1:td1'] - view_obj = mmif_obj['v4'] - td1 = view_obj['v4:td1'] + mmif_obj.__getitem__('v1:td1') + view_obj = mmif_obj.__getitem__('v4') + td1 = view_obj.__getitem__('v4:td1') self.assertEqual(td1.properties.mime, 'text/plain') - a1 = view_obj['v4:a1'] + a1 = view_obj.__getitem__('v4:a1') self.assertEqual(a1.at_type, AnnotationTypes.Alignment) with self.assertRaises(KeyError): - view_obj['completely-unlikely-annotation-id'] - + view_obj.__getitem__('completely-unlikely-annotation-id') + def test_get_annotations(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) # simple search by at_type @@ -913,7 +913,8 @@ def test_error_to_text(self): self.assertTrue(aview.has_error()) self.assertTrue(isinstance(mmif_obj.get_last_error(), str)) err_str = 'custom error as a single long string' - aview.metadata.error = err_str + aview.metadata.error = ErrorDict({'message': err_str}) + print(aview.metadata.error) self.assertTrue(aview.has_error()) self.assertTrue(isinstance(mmif_obj.get_last_error(), str)) self.assertIn(err_str, mmif_obj.get_last_error()) @@ -1039,7 +1040,7 @@ def test_add_property(self): removed_prop_key, removed_prop_value = list(props.items())[-1] props.pop(removed_prop_key) new_mmif = Mmif(datum['json']) - new_mmif.get_view_by_id(view_id).annotations[first_ann_id].add_property(removed_prop_key, removed_prop_value) + new_mmif.get(view_id).annotations[first_ann_id].add_property(removed_prop_key, removed_prop_value) self.assertEqual(json.loads(datum['string'])['views'][j], json.loads(new_mmif.serialize())['views'][j], f'Failed on {i}, {view_id}') @@ -1236,9 +1237,9 @@ def test_document_adding_duplicate_properties(self): doc1.add_property('publisher', 'they') self.assertEqual(2, len(doc1._props_pending)) mmif_roundtrip3 = Mmif(mmif_roundtrip2.serialize()) - r0_v_anns = list(mmif_roundtrip3.views[r0_vid].get_annotations(AnnotationTypes.Annotation)) - r1_v_anns = list(mmif_roundtrip3.views[r1_vid].get_annotations(AnnotationTypes.Annotation)) - r2_v_anns = list(mmif_roundtrip3.views[r2_vid].get_annotations(AnnotationTypes.Annotation)) + r0_v_anns = list(mmif_roundtrip3.views.get_by_key(r0_vid).get_annotations(AnnotationTypes.Annotation)) + r1_v_anns = list(mmif_roundtrip3.views.get_by_key(r1_vid).get_annotations(AnnotationTypes.Annotation)) + r2_v_anns = list(mmif_roundtrip3.views.get_by_key(r2_vid).get_annotations(AnnotationTypes.Annotation)) # two props (`author` and `publisher`) are serialized to one `Annotation` objects self.assertEqual(1, len(r0_v_anns)) self.assertEqual(0, len(r1_v_anns)) @@ -1324,7 +1325,7 @@ def test_capital_annotation_generation_viewfinder(self): mmif[f'doc{i+1}'].add_property('author', authors[i]) mmif_roundtrip = Mmif(mmif.serialize()) for i in range(1, 3): - cap_anns = list(mmif_roundtrip.views[f'v{i}'].get_annotations(AnnotationTypes.Annotation)) + cap_anns = list(mmif_roundtrip.views.get_by_key(f'v{i}').get_annotations(AnnotationTypes.Annotation)) self.assertEqual(1, len(cap_anns)) self.assertEqual(authors[i-1], cap_anns[0].get_property('author')) @@ -1391,7 +1392,7 @@ def test_add_property(self): properties.pop(removed_prop_key) try: new_mmif = Mmif(datum['json']) - new_mmif.get_document_by_id(document_id).add_property(removed_prop_key, removed_prop_value) + new_mmif.get(document_id).add_property(removed_prop_key, removed_prop_value) self.assertEqual(json.loads(datum['string']), json.loads(new_mmif.serialize()), f'Failed on {i}, {document_id}') except ValidationError: continue @@ -1406,13 +1407,6 @@ def test_setitem(self): self.datalist['v1'] = View({'id': 'v1'}) self.datalist['v2'] = View({'id': 'v2'}) - def test_getitem(self): - self.assertIs(self.mmif_obj['v1'], self.datalist['v1']) - - def test_getitem_raises(self): - with self.assertRaises(KeyError): - _ = self.datalist['reserved_names'] - def test_append(self): self.assertTrue('v256' not in self.datalist._items) self.datalist.append(View({'id': 'v256'})) @@ -1462,7 +1456,7 @@ def test_setitem_fail_on_reserved_name(self): self.assertEqual("can't set item on a reserved name", ke.args[0]) def test_get(self): - self.assertEqual(self.datalist['v1'], self.datalist.get('v1')) + self.assertEqual(self.datalist.get_by_key('v1'), self.datalist.get('v1')) def test_update(self): other_contains = """{ From 5d41378a34709527f1f58e9d88554d2552e43463 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Mon, 6 Oct 2025 12:51:14 -0400 Subject: [PATCH 03/14] replacing deepdiff with custom serialization logic --- mmif/serialize/annotation.py | 4 +- mmif/serialize/mmif.py | 15 ++++--- mmif/serialize/model.py | 58 +++++++++++++++++------- mmif/serialize/view.py | 6 ++- requirements.txt | 2 +- tests/test_serialize.py | 85 ++++++++++++++++++++++++++++++++++++ 6 files changed, 143 insertions(+), 27 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 0b2d3ad6..35be2f8c 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -559,8 +559,8 @@ def _deserialize(self, input_dict: dict) -> None: self.location = input_dict.pop("location") super()._deserialize(input_dict) - def _serialize(self, alt_container: Optional[Dict] = None) -> dict: - serialized = super()._serialize() + def _serialize(self, *args, **kwargs) -> dict: + serialized = super()._serialize(**kwargs) if "location_" in serialized: serialized["location"] = serialized.pop("location_") return serialized diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 2715bae8..f701ad8d 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -179,17 +179,18 @@ def validate(json_str: Union[bytes, str, dict]) -> None: json_str = json.loads(json_str) jsonschema.validators.validate(json_str, schema) - def serialize(self, pretty: bool = False, sanitize: bool = False, autogenerate_capital_annotations=True) -> str: + def serialize(self, sanitize: bool = False, autogenerate_capital_annotations: bool = True, **kwargs) -> str: """ Serializes the MMIF object to a JSON string. - :param sanitize: If True, performs some sanitization of before returning + :param sanitize: If True, performs some sanitization of before returning the JSON string. See :meth:`sanitize` for details. - :param autogenerate_capital_annotations: If True, automatically convert - any "pending" temporary properties from `Document` objects to - `Annotation` objects. See :meth:`generate_capital_annotations` for + :param autogenerate_capital_annotations: If True, automatically convert + any "pending" temporary properties from `Document` objects to + `Annotation` objects. See :meth:`generate_capital_annotations` for details. - :param pretty: If True, returns string representation with indentation. + :param kwargs: Keyword arguments to pass to the parent's ``serialize`` + method (e.g., ``pretty=True``, ``include_context=False``). :return: JSON string of the MMIF object. """ if autogenerate_capital_annotations: @@ -197,7 +198,7 @@ def serialize(self, pretty: bool = False, sanitize: bool = False, autogenerate_c # sanitization should be done after `Annotation` annotations are generated if sanitize: self.sanitize() - return super().serialize(pretty) + return super().serialize(**kwargs) def _deserialize(self, input_dict: dict) -> None: """ diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 70ff9e25..f331c58b 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -16,8 +16,6 @@ from datetime import datetime from typing import Union, Any, Dict, Optional, TypeVar, Generic, Generator, Iterator, Type, Set, ClassVar -from deepdiff import DeepDiff - T = TypeVar('T') S = TypeVar('S') PRMTV_TYPES: Type = Union[str, int, float, bool, None] @@ -93,10 +91,12 @@ class MmifObject(object): '_unnamed_attributes', '_attribute_classes', '_required_attributes', - '_exclude_from_diff' + '_exclude_from_diff', + '_contextual_attributes' } _unnamed_attributes: Optional[dict] _exclude_from_diff: Set[str] + _contextual_attributes: Set[str] _attribute_classes: Dict[str, Type] = {} # Mapping: str -> Type def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *_) -> None: @@ -106,6 +106,8 @@ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *_) -> No self._required_attributes = [] if not hasattr(self, '_exclude_from_diff'): self._exclude_from_diff = set() + if not hasattr(self, '_contextual_attributes'): + self._contextual_attributes = set() if not hasattr(self, '_unnamed_attributes'): self._unnamed_attributes = {} if mmif_obj is not None: @@ -139,16 +141,21 @@ def _named_attributes(self) -> Generator[str, None, None]: """ return (n for n in self.__dict__.keys() if n not in self.reserved_names) - def serialize(self, pretty: bool = False) -> str: + def serialize(self, pretty: bool = False, include_context: bool = True) -> str: """ Generates JSON representation of an object. :param pretty: If True, returns string representation with indentation. + :param include_context: If ``False``, excludes contextual attributes from + serialization. Contextual attributes hold information + that varies at runtime (e.g., timestamps) and do not + constitute the core information of the MMIF object. + This is useful for comparing two MMIF objects for equality. :return: JSON string of the object. """ - return json.dumps(self._serialize(), indent=2 if pretty else None, cls=MmifObjectEncoder) + return json.dumps(self._serialize(include_context=include_context), indent=2 if pretty else None, cls=MmifObjectEncoder) - def _serialize(self, alt_container: Optional[Dict] = None) -> dict: + def _serialize(self, alt_container: Optional[Dict] = None, include_context: bool = True) -> dict: """ Maps a MMIF object to a plain python dict object, rewriting internal keys that start with '_' to @@ -157,6 +164,8 @@ def _serialize(self, alt_container: Optional[Dict] = None) -> dict: If a subclass needs special treatment during the mapping, it needs to override this method. + :param alt_container: Alternative container to serialize from + :param include_context: See :meth:`serialize` for details. :return: the prepared dictionary """ container = alt_container if alt_container is not None else self._unnamed_attributes @@ -166,20 +175,32 @@ def _serialize(self, alt_container: Optional[Dict] = None) -> dict: if v is None: continue k = str(k) + if not include_context and k in self._contextual_attributes: + continue if k.startswith('_'): # _ as a placeholder ``@`` in json-ld k = f'@{k[1:]}' - serializing_obj[k] = v + # Recursively serialize nested MmifObjects with the same include_context parameter + if isinstance(v, MmifObject): + serializing_obj[k] = v._serialize(include_context=include_context) + else: + serializing_obj[k] = v except AttributeError as e: # means _unnamed_attributes is None, so nothing unnamed would be serialized pass for k, v in self.__dict__.items(): if k in self.reserved_names: continue + if not include_context and k in self._contextual_attributes: + continue if k not in self._required_attributes and self.is_empty(v): continue if k.startswith('_'): # _ as a placeholder ``@`` in json-ld k = f'@{k[1:]}' - serializing_obj[k] = v + # Recursively serialize nested MmifObjects with the same include_context parameter + if isinstance(v, MmifObject): + serializing_obj[k] = v._serialize(include_context=include_context) + else: + serializing_obj[k] = v return serializing_obj @staticmethod @@ -263,14 +284,21 @@ def _deserialize(self, input_dict: dict) -> None: self[k] = v def __str__(self) -> str: - return self.serialize(False) + return self.serialize() def __eq__(self, other) -> bool: + """ + Compares two MmifObject instances for equality by comparing their serialized + representations with contextual attributes excluded. + + This avoids issues with DeepDiff accessing properties that may raise exceptions, + and properly handles comparison by ignoring contextual attributes like timestamps + and stack traces that vary based on runtime environment. + + See https://github.com/clamsproject/mmif-python/issues/311 for details. + """ return isinstance(other, type(self)) and \ - len(DeepDiff(self, other, report_repetition=True, exclude_types=[datetime], - # https://github.com/clamsproject/mmif-python/issues/214 - exclude_paths=self._exclude_from_diff) - ) == 0 + self.serialize(include_context=False) == other.serialize(include_context=False) def __len__(self) -> int: """ @@ -356,7 +384,7 @@ def _serialize(self, *args, **kwargs) -> list: # pytype: disable=signature-mism :return: list of the values of the internal dictionary. """ - return list(super()._serialize(self._items).values()) + return list(super()._serialize(self._items, **kwargs).values()) def deserialize(self, mmif_json: Union[str, list]) -> None: # pytype: disable=signature-mismatch """ @@ -450,7 +478,7 @@ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *_): super().__init__(mmif_obj) def _serialize(self, *args, **kwargs) -> dict: - return super()._serialize(self._items) + return super()._serialize(self._items, **kwargs) def get(self, key: T, default=None) -> Optional[S]: return self._items.get(key, default) diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d8821c5b..740a98f9 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -346,6 +346,7 @@ def __init__(self, viewmetadata_obj: Optional[Union[bytes, str, dict]] = None, * self.error: Union[dict, ErrorDict] = {} self.warnings: List[str] = [] self._required_attributes = ["app"] + self._contextual_attributes = {"timestamp"} self._attribute_classes = { 'error': ErrorDict, 'contains': ContainsDict @@ -357,8 +358,8 @@ def __init__(self, viewmetadata_obj: Optional[Union[bytes, str, dict]] = None, * # also see this class' `_serialize()` override implementation super().__init__(viewmetadata_obj) - def _serialize(self, alt_container: Optional[Dict] = None) -> dict: - serialized = super()._serialize() + def _serialize(self, *args, **kwargs) -> dict: + serialized = super()._serialize(**kwargs) # `_serialize()` eliminates any *empty* attributes, so # when no "contains", "errors", nor "warnings", at least add an empty contains back if not (self.contains.items() or self.error or self.warnings): @@ -459,6 +460,7 @@ class ErrorDict(MmifObject): def __init__(self, error_obj: Optional[Union[bytes, str, dict]] = None, *_) -> None: self.message: str = '' self.stackTrace: str = '' + self._contextual_attributes = {"stackTrace"} super().__init__(error_obj) def __str__(self): diff --git a/requirements.txt b/requirements.txt index ccf8bbd8..a97c214e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -deepdiff>5 + orderly-set==5.3.* # 5.4 drops py38 support jsonschema diff --git a/tests/test_serialize.py b/tests/test_serialize.py index bec77d28..45932625 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -28,6 +28,37 @@ tester_appname = 'http://not.existing/app' +class TestMmifObject(unittest.TestCase): + + def test_setattr_additional_properties_disallowed(self): + """Test that setting additional properties raises AttributeError when disallowed""" + # Create an object that disallows additional properties + obj = MmifObject() + obj._unnamed_attributes = None # Disallow additional properties + + with self.assertRaises(AttributeError) as cm: + obj.test_prop = "value" + self.assertIn("Additional properties are disallowed", str(cm.exception)) + + def test_setattr_additional_properties_allowed(self): + """Test that setting additional properties works when allowed""" + obj = MmifObject() + obj._unnamed_attributes = {} # Allow additional properties + + obj.test_prop = "value" + self.assertEqual(obj.test_prop, "value") + self.assertEqual(obj._unnamed_attributes["test_prop"], "value") + + def test_serialize_unnamed_attributes_none(self): + """Test serialization when _unnamed_attributes is None""" + obj = MmifObject() + obj._unnamed_attributes = None + + # This should not raise an error, but handle AttributeError gracefully + serialized = obj.serialize() + self.assertIsInstance(serialized, str) + + class TestMmif(unittest.TestCase): def setUp(self) -> None: @@ -478,6 +509,60 @@ def test_eq_checking_order(self): mmif4.add_view(view2) self.assertTrue(mmif3 == mmif4) + def test_eq_basic(self): + """Test basic equality comparison (issue #311)""" + minimal_mmif = ''' + { + "metadata": { + "mmif": "http://mmif.clams.ai/1.0.0" + }, + "documents": [ + { + "@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", + "properties": { + "mime": "video", + "id": "d1", + "location": "file:///test.mp4" + } + } + ], + "views": [] + }''' + m1 = Mmif(minimal_mmif) + m2 = Mmif(minimal_mmif) + self.assertTrue(m1 == m2) + + def test_eq_with_different_documents(self): + """Test inequality when documents differ (issue #311)""" + mmif1_str = ''' + { + "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, + "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", + "properties": {"mime": "video", "id": "d1", "location": "file:///test1.mp4"}}], + "views": [] + }''' + mmif2_str = mmif1_str.replace("d1", "d2") + m1 = Mmif(mmif1_str) + m2 = Mmif(mmif2_str) + self.assertFalse(m1 == m2) + + def test_eq_ignores_contextual_attributes(self): + """Test that contextual attributes (timestamps) are ignored in equality comparison (issue #311)""" + from datetime import datetime, timedelta + mmif_str = ''' + { + "metadata": {"mmif": "http://mmif.clams.ai/1.0.0"}, + "documents": [{"@type": "http://mmif.clams.ai/vocabulary/VideoDocument/v1", + "properties": {"mime": "video", "id": "d1", "location": "file:///test.mp4"}}], + "views": [{"id": "v1", "metadata": {"app": "http://mmif.clams.ai/apps/test/1.0", "contains": {}}, "annotations": []}] + }''' + m1 = Mmif(mmif_str) + m2 = Mmif(mmif_str) + # Set different timestamps + m1.views.get('v1').metadata.timestamp = datetime.now() + m2.views.get('v1').metadata.timestamp = datetime.now() + timedelta(seconds=10) + self.assertEqual(m1, m2) + def test___getitem__(self): mmif_obj = Mmif(MMIF_EXAMPLES['everything']) self.assertIsInstance(mmif_obj['m1'], Document) From 7fa50e747adff9fdc04c1285b599789cf800581c Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 8 Oct 2025 20:51:01 -0400 Subject: [PATCH 04/14] more updates to test cases --- tests/test_serialize.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/test_serialize.py b/tests/test_serialize.py index 45932625..1bf6ff0e 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -64,10 +64,9 @@ class TestMmif(unittest.TestCase): def setUp(self) -> None: self.mmif_examples_json = {k: json.loads(v) for k, v in MMIF_EXAMPLES.items()} - @pytest.mark.skip("comparing two `Mmif` objs with an arbitrary file path included won't work until https://github.com/seperman/deepdiff/issues/357 is addressed") def test_init_from_bytes(self): - mmif_from_str = Mmif(EVERYTHING_JSON) - mmif_from_bytes = Mmif(EVERYTHING_JSON.encode('utf8')) + mmif_from_str = Mmif(MMIF_EXAMPLES['everything']) + mmif_from_bytes = Mmif(MMIF_EXAMPLES['everything'].encode('utf8')) self.assertEqual(mmif_from_str, mmif_from_bytes) def test_str_mmif_deserialize(self): @@ -487,10 +486,9 @@ def test_add_view(self): except KeyError: self.fail("raised exception on duplicate ID add when overwrite was set to True") - @pytest.mark.skip("comparing two `Mmif` objs with an arbitrary file path included won't work until https://github.com/seperman/deepdiff/issues/357 is addressed") def test_eq_checking_order(self): - mmif1 = Mmif(EVERYTHING_JSON) - mmif2 = Mmif(EVERYTHING_JSON) + mmif1 = Mmif(MMIF_EXAMPLES['everything']) + mmif2 = Mmif(MMIF_EXAMPLES['everything']) view1 = View() view1.id = 'v99' view2 = View() @@ -501,8 +499,8 @@ def test_eq_checking_order(self): mmif2.add_view(view1) self.assertFalse(mmif1 == mmif2) - mmif3 = Mmif(EVERYTHING_JSON) - mmif4 = Mmif(EVERYTHING_JSON) + mmif3 = Mmif(MMIF_EXAMPLES['everything']) + mmif4 = Mmif(MMIF_EXAMPLES['everything']) mmif3.add_view(view1) mmif3.add_view(view2) mmif4.add_view(view1) From 2bd897b8cd6aa4db160fc81140a7a72d17ad6113 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 28 Oct 2025 20:33:46 -0400 Subject: [PATCH 05/14] updated outdated docstrings --- mmif/serialize/annotation.py | 45 ++++++++++++---------- mmif/serialize/mmif.py | 30 ++++++++------- mmif/serialize/model.py | 10 ++--- mmif/serialize/view.py | 20 +++++----- mmif/utils/sequence_helper.py | 10 ++--- mmif/utils/text_document_helper.py | 9 +++++ mmif/utils/video_document_helper.py | 17 +++++--- templates/python/vocabulary/base_types.txt | 11 ++++++ 8 files changed, 92 insertions(+), 60 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 0b2d3ad6..9987a600 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -74,10 +74,11 @@ def _deserialize(self, input_dict: dict) -> None: def _cache_alignment(self, alignment_ann: 'Annotation', alignedto_ann: 'Annotation') -> None: """ - Cache alignment information. This cache will not be serialized. - + Cache alignment information. This cache will not be serialized. + :param alignment_ann: the Alignment annotation that has this annotation on one side :param alignedto_ann: the annotation that this annotation is aligned to (other side of Alignment) + :return: None """ self._alignments[alignment_ann] = alignedto_ann @@ -228,10 +229,9 @@ def add_property(self, name: str, value: Union[PRMTV_TYPES, LIST_PRMTV, LIST_LIST_PRMTV, DICT_PRMTV, DICT_LIST_PRMTV]) -> None: """ Adds a property to the annotation's properties. - + :param name: the name of the property :param value: the property's desired value - :return: None """ # if self.check_prop_value_is_simple_enough(value): self.properties[name] = value @@ -336,29 +336,32 @@ def add_property(self, name: str, ) -> None: """ Adds a property to the document's properties. - - Unlike the parent :class:`Annotation` class, added properties of a - ``Document`` object can be lost during serialization unless it belongs - to somewhere in a ``Mmif`` object. This is because we want to keep - ``Document`` object as "read-only" as possible. Thus, if you want to add - a property to a ``Document`` object, - - * add the document to a ``Mmif`` object (either in the documents list or + + Unlike the parent :class:`Annotation` class, added properties of a + ``Document`` object can be lost during serialization unless it belongs + to somewhere in a ``Mmif`` object. This is because we want to keep + ``Document`` object as "read-only" as possible. Thus, if you want to add + a property to a ``Document`` object, + + * add the document to a ``Mmif`` object (either in the documents list or in a view from the views list), or * directly write to ``Document.properties`` instead of using this method - (which is not recommended). - - With the former method, the SDK will record the added property as a - `Annotation` annotation object, separate from the original `Document` + (which is not recommended). + + With the former method, the SDK will record the added property as a + `Annotation` annotation object, separate from the original `Document` object. See :meth:`.Mmif.generate_capital_annotations()` for more. - + A few notes to keep in mind: - - #. You can't overwrite an existing property of a ``Document`` object. - #. A MMIF can have multiple ``Annotation`` objects with the same + + #. You can't overwrite an existing property of a ``Document`` object. + #. A MMIF can have multiple ``Annotation`` objects with the same property name but different values. When this happens, the SDK will - only keep the latest value (in order of appearances in views list) of + only keep the latest value (in order of appearances in views list) of the property, effectively overwriting the previous values. + + :param name: the name of the property + :param value: the property's desired value (note: Document accepts fewer value types than Annotation) """ # we don't checking if this k-v already exists in _original (new props) or _ephemeral (read from existing MMIF) # because it is impossible to keep the _original updated when a new annotation is added (via `new_annotation`) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 2715bae8..57c8d93f 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -183,13 +183,13 @@ def serialize(self, pretty: bool = False, sanitize: bool = False, autogenerate_c """ Serializes the MMIF object to a JSON string. - :param sanitize: If True, performs some sanitization of before returning + :param pretty: If True, returns string representation with indentation. + :param sanitize: If True, performs some sanitization of before returning the JSON string. See :meth:`sanitize` for details. - :param autogenerate_capital_annotations: If True, automatically convert - any "pending" temporary properties from `Document` objects to - `Annotation` objects. See :meth:`generate_capital_annotations` for + :param autogenerate_capital_annotations: If True, automatically convert + any "pending" temporary properties from `Document` objects to + `Annotation` objects. See :meth:`generate_capital_annotations` for details. - :param pretty: If True, returns string representation with indentation. :return: JSON string of the MMIF object. """ if autogenerate_capital_annotations: @@ -408,13 +408,13 @@ def add_view(self, view: View, overwrite=False) -> None: """ Appends a View object to the views list. - Fails if there is already a view with the same ID or a document + Fails if there is already a view with the same ID or a document with the same ID in the MMIF object. - :param view: the Document object to add + :param view: the View object to add :param overwrite: if set to True, will overwrite an existing view with the same ID - :raises KeyError: if ``overwrite`` is set to False and existing + :raises KeyError: if ``overwrite`` is set to False and existing object (document or view) with the same ID exists :return: None """ @@ -503,6 +503,7 @@ def get_documents_locations(self, m_type: Union[DocumentTypes, str], path_only=F Only top-level documents have locations, so we only check them. :param m_type: the type to search for + :param path_only: if True, returns resolved file system paths instead of location URIs :return: a list of the values of the location fields in the corresponding documents """ docs = [document for document in self.documents if document.is_type(m_type) and document.location is not None] @@ -516,6 +517,7 @@ def get_document_location(self, m_type: Union[DocumentTypes, str], path_only=Fal Method to get the location of *first* document of given type. :param m_type: the type to search for + :param path_only: if True, returns resolved file system path instead of location URI :return: the value of the location field in the corresponding document """ # TODO (krim @ 8/10/20): Is returning the first location desirable? @@ -568,6 +570,8 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s """ Finds views where alignments between two given annotation types occurred. + :param at_type1: the first annotation type to search for alignments + :param at_type2: the second annotation type to search for alignments :return: a dict that keyed by view IDs (str) and has lists of alignment Annotation objects as values. """ v_and_a = {} @@ -691,8 +695,8 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int, """ Finds annotations that are anchored between the given time points. - :param start: the start time point in the unit of `input_unit` - :param end: the end time point in the unit of `input_unit` + :param start: the start time point + :param end: the end time point :param time_unit: the unit of the input time points. Default is `ms`. :param at_types: a list of annotation types to filter with. Any type in this list will be included in the return. :return: an iterator of Annotation objects that are anchored between the given time points @@ -725,12 +729,12 @@ def get_annotations_between_time(self, start: Union[int, float], end: Union[int, def _get_linear_anchor_point(self, ann: Annotation, targets_sorted=False, start: bool = True) -> Union[int, float]: # TODO (krim @ 2/5/24): Update the return type once timeunits are unified to `ms` as integers (https://github.com/clamsproject/mmif/issues/192) """ - Retrieves the anchor point of the annotation. Currently, this method only supports linear anchors, + Retrieves the anchor point of the annotation. Currently, this method only supports linear anchors, namely time and text, hence does not work with spatial anchors (polygons or video-object). - + :param ann: An Annotation object that has a linear anchor point. Namely, some subtypes of `Region` vocabulary type. - :param start: If True, returns the start anchor point. Otherwise, returns the end anchor point. N/A for `timePoint` anchors. :param targets_sorted: If True, the method will assume that the targets are sorted in the order of the anchor points. + :param start: If True, returns the start anchor point. Otherwise, returns the end anchor point. N/A for `timePoint` anchors. :return: the anchor point of the annotation. 1d for linear regions (time, text) """ props = ann.properties diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 70ff9e25..8f2ee634 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -157,6 +157,7 @@ def _serialize(self, alt_container: Optional[Dict] = None) -> dict: If a subclass needs special treatment during the mapping, it needs to override this method. + :param alt_container: optional alternative container dict to serialize instead of _unnamed_attributes :return: the prepared dictionary """ container = alt_container if alt_container is not None else self._unnamed_attributes @@ -375,14 +376,13 @@ def _deserialize(self, input_list: list) -> None: def get(self, key: str, default=None) -> Optional[T]: """ - Standard dictionary-style get() method, albeit with no ``default`` - parameter. Relies on the implementation of __getitem__. + Standard dictionary-style get() method. Relies on the implementation of __getitem__. - Will return ``None`` if the key is not found. + Will return the default value if the key is not found. :param key: the key to search for - :param default: the default value to return if the key is not found - :return: the value matching that key + :param default: the default value to return if the key is not found (defaults to None) + :return: the value matching that key, or the default value if not found """ try: return self[key] diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d8821c5b..2fec31a1 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -215,7 +215,7 @@ def add_document(self, document: Document, overwrite=False) -> Annotation: :param document: the Document object to add :param overwrite: if set to True, will overwrite an existing view with the same ID - :return: None + :return: the added Document object (as an Annotation) """ return self.add_annotation(document, overwrite) @@ -270,18 +270,18 @@ def get_documents(self) -> List[Document]: def get_document_by_id(self, doc_id) -> Document: """ .. deprecated:: 1.1.0 - Will be removed in 2.0.0. - Use general ``Mmif.__getitem__()`` method instead to retrieve - any document across the MMIF, or View.__getitems__() to + Will be removed in 2.0.0. + Use general ``Mmif.__getitem__()`` method instead to retrieve + any document across the MMIF, or View.__getitems__() to retrieve documents within the view. - Thinly wraps the Mmif.__getitem__ method and returns an Annotation - object. Note that although this method is under View class, it can - be used to retrieve any annotation across the entire MMIF. + Thinly wraps the Mmif.__getitem__ method and returns a Document + object. Note that although this method is under View class, it can + be used to retrieve any document across the entire MMIF. - :param ann_id: the ID of the annotation to retrieve. - :return: found :class:`mmif.serialize.annotation.Annotation` object. - :raises KeyError: if the annotation with the given ID is not found + :param doc_id: the ID of the document to retrieve. + :return: found :class:`mmif.serialize.annotation.Document` object. + :raises KeyError: if the document with the given ID is not found """ warnings.warn( "View.get_document_by_id() is deprecated, use view[doc_id] instead.", diff --git a/mmif/utils/sequence_helper.py b/mmif/utils/sequence_helper.py index 5ea5289c..f4edcb19 100644 --- a/mmif/utils/sequence_helper.py +++ b/mmif/utils/sequence_helper.py @@ -150,17 +150,17 @@ def smooth_outlying_short_intervals(scores: List[float], works essentially in the "stitching" only mode. :param scores: **SORTED** list of scores to be smoothed. The score list - is assumed to be "exhaust" the entire time or space of + is assumed to be "exhaust" the entire time or space of the underlying document segment. (Sorted by the start, and then by the end of anchors) - :param min_score: minimum threshold to use to discard - low-scored units (strictly less than) - :param min_spseq_size: minimum size of a positive sequence not to be + :param min_spseq_size: minimum size of a positive sequence not to be smoothed (greater or equal to) :param min_snseq_size: minimum size of a negative sequence not to be smoothed (greater or equal to) + :param min_score: minimum threshold to use to discard + low-scored units (strictly less than) :return: list of tuples of start(inclusive)/end(exclusive) indices - of the "positive" sequences. Negative sequences (regardless of + of the "positive" sequences. Negative sequences (regardless of their size) are not included in the output. """ diff --git a/mmif/utils/text_document_helper.py b/mmif/utils/text_document_helper.py index 23cc92aa..87e8c478 100644 --- a/mmif/utils/text_document_helper.py +++ b/mmif/utils/text_document_helper.py @@ -4,6 +4,15 @@ def slice_text(mmif_obj, start: int, end: int, unit: str = "milliseconds") -> str: + """ + Extracts text from tokens within a specified time range. + + :param mmif_obj: MMIF object to search for tokens + :param start: start time point + :param end: end time point + :param unit: time unit for start and end parameters (default: "milliseconds") + :return: space-separated string of token words found in the time range + """ token_type = AnnotationTypes.Token anns_found = mmif_obj.get_annotations_between_time(start, end, unit) tokens_sliced = [] diff --git a/mmif/utils/video_document_helper.py b/mmif/utils/video_document_helper.py index b76ece7c..a1b9c59a 100644 --- a/mmif/utils/video_document_helper.py +++ b/mmif/utils/video_document_helper.py @@ -76,11 +76,12 @@ def get_framerate(video_document: Document) -> float: def extract_frames_as_images(video_document: Document, framenums: Iterable[int], as_PIL: bool = False, record_ffmpeg_errors: bool = False): """ Extracts frames from a video document as a list of :py:class:`numpy.ndarray`. - Use with :py:func:`sample_frames` function to get the list of frame numbers first. - + Use with :py:func:`sample_frames` function to get the list of frame numbers first. + :param video_document: :py:class:`~mmif.serialize.annotation.Document` instance that holds a video document (``"@type": ".../VideoDocument/..."``) :param framenums: iterable integers representing the frame numbers to extract :param as_PIL: return :py:class:`PIL.Image.Image` instead of :py:class:`~numpy.ndarray` + :param record_ffmpeg_errors: if True, records and warns about FFmpeg stderr output during extraction :return: frames as a list of :py:class:`~numpy.ndarray` or :py:class:`~PIL.Image.Image` """ import cv2 @@ -212,11 +213,11 @@ def sample_frames(start_frame: int, end_frame: int, sample_rate: float = 1) -> L """ Helper function to sample frames from a time interval. Can also be used as a "cutoff" function when used with ``start_frame==0`` and ``sample_rate==1``. - + :param start_frame: start frame of the interval :param end_frame: end frame of the interval :param sample_rate: sampling rate (or step) to configure how often to take a frame, default is 1, meaning all consecutive frames are sampled - + :return: list of frame numbers to extract """ if sample_rate < 1: raise ValueError(f"Sample rate must be greater than 1, but got {sample_rate}") @@ -235,9 +236,13 @@ def get_annotation_property(mmif, annotation, prop_name): .. deprecated:: 1.0.8 Will be removed in 2.0.0. Use :py:meth:`mmif.serialize.annotation.Annotation.get_property` method instead. - + Get a property value from an annotation. If the property is not found in the annotation, it will look up the metadata of the annotation's parent view and return the value from there. - xisting + + :param mmif: MMIF object containing the annotation + :param annotation: Annotation object to get property from + :param prop_name: name of the property to retrieve + :return: the property value """ warnings.warn(f'{__name__}() is deprecated. ' f'Directly ask the annotation for a property by calling annotation.get_property() instead.', diff --git a/templates/python/vocabulary/base_types.txt b/templates/python/vocabulary/base_types.txt index 09736741..dbff5e1d 100644 --- a/templates/python/vocabulary/base_types.txt +++ b/templates/python/vocabulary/base_types.txt @@ -17,6 +17,11 @@ class TypesBase(object): old_lapps_type_shortnames = {'Token', 'Sentence', 'Paragraph', 'Markable', 'NamedEntity', 'NounChunk', 'VerbChunk'} def __init__(self, type_uri: str): + """ + Initialize a vocabulary type. + + :param type_uri: full URI or short name of the vocabulary type + """ self.fuzzy_eq = False self.parse_names(type_uri) if self.__repr__() not in self.__class__._prefixes: @@ -122,6 +127,12 @@ class ClamsTypesBase(TypesBase): dev_version: ClassVar[str] = 'develop' def __init__(self, type_uri, fuzzymode=True): + """ + Initialize a CLAMS vocabulary type. + + :param type_uri: full URI or short name of the CLAMS vocabulary type + :param fuzzymode: if True, enables fuzzy equality comparison that ignores version differences + """ super().__init__(type_uri) self.fuzzy_eq = fuzzymode From c5ae73a277434dd55821fc6f035764e9d7135257 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Wed, 29 Oct 2025 20:38:33 -0400 Subject: [PATCH 06/14] big updates on docstring based on current `get_by_id` impl --- mmif/serialize/annotation.py | 90 ++++++++++++++----- mmif/serialize/mmif.py | 99 ++++++++++++++++++-- mmif/serialize/model.py | 170 ++++++++++++++++++++++++++++++----- mmif/serialize/view.py | 85 ++++++++++++++++-- 4 files changed, 388 insertions(+), 56 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index 0b2d3ad6..fe914bb2 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -256,18 +256,39 @@ def __getitem__(self, prop_name: str): def get(self, prop_name: str, default=None): """ - A getter for Annotation, will search for a property by its name, - and return the value if found, or the default value if not found. - This is designed to allow for directly accessing properties without - having to go through the properties object, or view-level - annotation metadata (common properties) encoded in the - ``view.metadata.contains`` dict. Note that the regular properties - will take the priority over the view-level common properties when - there are name conflicts. - - :param prop_name: the name of the property to get - :param default: the value to return if the property is not found - :return: the value of the property + Safe property access with optional default value. + + Searches for an annotation property by name and returns its value, + or a default value if not found. This method searches in multiple + locations with the following priority: + + 1. Direct properties (in ``annotation.properties``) + 2. Ephemeral properties (view-level metadata from ``contains``) + 3. Special fields (``@type``, ``properties``) + + This allows convenient access to properties without explicitly + checking the ``properties`` object or view-level metadata. + + :param prop_name: The name of the property to retrieve + :param default: The value to return if the property is not found (default: None) + :return: The property value, or the default value if not found + + Examples + -------- + >>> # Access annotation properties: + >>> label = annotation.get('label', default='unknown') + >>> start_time = annotation.get('start', default=0) + >>> + >>> # Access @type: + >>> at_type = annotation.get('@type') + >>> + >>> # Safe access with custom default: + >>> targets = annotation.get('targets', default=[]) + + See Also + -------- + __getitem__ : Direct property access that raises KeyError when not found + get_property : Alias for this method """ try: return self.__getitem__(prop_name) @@ -378,13 +399,42 @@ def add_property(self, name: str, def get(self, prop_name, default=None): """ - A special getter for Document properties. The major difference from - the super class's :py:meth:`Annotation.get` method is that Document - class has one more set of *"pending"* properties, that are added after - the Document object is created and will be serialized as a separate - :py:class:`Annotation` object of which ``@type = Annotation``. The - pending properties will take the priority over the regular properties - when there are conflicts. + Safe property access with optional default value for Document objects. + + Searches for a document property by name and returns its value, or a + default value if not found. Documents have a more complex property + hierarchy than regular annotations: + + Priority order (highest to lowest): + 1. Special fields ('id', 'location') + 2. Pending properties (added after creation, to be serialized as ``Annotation`` objects) + 3. Ephemeral properties (from existing ``Annotation`` annotations or view metadata) + 4. Original properties (in ``document.properties``) + + This allows convenient access to all document properties regardless of + where they're stored internally. + + :param prop_name: The name of the property to retrieve + :param default: The value to return if the property is not found (default: None) + :return: The property value, or the default value if not found + + Examples + -------- + >>> # Access document properties: + >>> mime = document.get('mime', default='application/octet-stream') + >>> location = document.get('location') + >>> + >>> # Access properties added after creation (pending): + >>> author = document.get('author', default='anonymous') + >>> publisher = document.get('publisher') + >>> + >>> # Access ephemeral properties from Annotation objects: + >>> sentiment = document.get('sentiment', default='neutral') + + See Also + -------- + add_property : Add a new property to the document + Mmif.generate_capital_annotations : How pending properties are serialized """ if prop_name == 'id': # because all three dicts have `id` key as required field, we need @@ -399,7 +449,7 @@ class has one more set of *"pending"* properties, that are added after elif prop_name in self._props_ephemeral: return self._props_ephemeral[prop_name] else: - return super().get(prop_name) + return super().get(prop_name, default) get_property = get diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index a4c4512a..20f25f11 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -2,7 +2,16 @@ The :mod:`mmif` module contains the classes used to represent a full MMIF file as a live Python object. +The :class:`Mmif` class is a high-level container that provides convenient +string-based access to documents, views, and annotations via ``mmif[id]``. +The underlying ``documents`` and ``views`` attributes are list-like collections +that use integer indexing; use ``get_by_key()`` on those for ID-based access. + See the specification docs and the JSON Schema file for more information. + +.. versionchanged:: 1.1.3 + DocumentsList and ViewsList now only accept integer/slice indexing. + Use ``get_by_key()`` for string ID access on those collections. """ import json @@ -139,8 +148,39 @@ class Mmif(MmifObject): """ MmifObject that represents a full MMIF file. + This is a high-level container object that provides convenient string-based + access to documents, views, and annotations using their IDs. The underlying + collections (``documents`` and ``views``) are list-like and use integer + indexing, but Mmif itself accepts string IDs for convenient access. + :param mmif_obj: the JSON data :param validate: whether to validate the data against the MMIF JSON schema. + + Examples + -------- + Accessing objects by ID (high-level, convenient): + + >>> mmif = Mmif(mmif_json) + >>> doc = mmif['m1'] # Get document by ID + >>> view = mmif['v1'] # Get view by ID + >>> ann = mmif['v1:a1'] # Get annotation by long-form ID + >>> + >>> # Safe access with default: + >>> doc = mmif.get('m99', default=None) + + Accessing via underlying lists (positional access): + + >>> first_doc = mmif.documents[0] # First document + >>> last_view = mmif.views[-1] # Last view + >>> all_views = mmif.views[1:4] # Slice of views + >>> + >>> # Access by ID on lists: + >>> doc = mmif.documents.get_by_key('m1') + >>> view = mmif.views.get_by_key('v1') + + .. versionchanged:: 1.1.3 + Underlying ``documents`` and ``views`` lists now only accept integer + indexing. Use ``get_by_key()`` for string ID access on those lists. """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -768,15 +808,56 @@ def get_end(self, annotation: Annotation) -> Union[int, float]: def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetadata]: """ - index ([]) implementation for Mmif. This will try to find any object, given an identifier or an immediate - attribute name. When nothing is found, this will raise an error rather than returning a None - - :raises KeyError: if the item is not found or if the search results are ambiguous - :param item: an attribute name or an object identifier (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view ID prefix), the method will try to find a - match from the first view, and return immediately if found. - :return: the object searched for - :raise KeyError: if the item is not found or multiple objects are found with the same ID + High-level string-based access to MMIF objects by their IDs. + + This method provides convenient access to documents, views, and annotations + using their string identifiers. For long-form annotation IDs (format: ``V:A``), + performs a two-level search through the specified view. + + Note: This is a high-level convenience method on the Mmif container itself. + The underlying ``documents`` and ``views`` collections are list-like and + only support integer indexing - use ``get_by_key()`` on those for ID-based access. + + :param item: An object identifier: + - Document ID (e.g., 'm1', 'd1') + - View ID (e.g., 'v1', 'v_0') + - Annotation ID in long form (e.g., 'v1:a1', 'v1:tf1') + - Attribute name (e.g., 'metadata', 'documents', 'views') + :return: The requested Document, View, Annotation, or attribute object + :raises KeyError: If the item is not found + + Examples + -------- + High-level access by ID: + + >>> mmif = Mmif(mmif_json) + >>> + >>> # Access documents: + >>> doc = mmif['m1'] # Returns Document with ID 'm1' + >>> + >>> # Access views: + >>> view = mmif['v1'] # Returns View with ID 'v1' + >>> + >>> # Access annotations (long-form ID): + >>> ann = mmif['v1:a1'] # Returns Annotation from view v1 + >>> + >>> # Access attributes: + >>> metadata = mmif['metadata'] # Returns MmifMetadata object + >>> + >>> # Will raise KeyError: + >>> doc = mmif['nonexistent'] # KeyError! + + For list-style positional access, use the underlying collections: + + >>> first_doc = mmif.documents[0] # Integer index + >>> second_view = mmif.views[1] # Integer index + >>> doc_by_id = mmif.documents.get_by_key('m1') # ID-based access + + See Also + -------- + get : Safe access with default value instead of raising KeyError + documents.get_by_key : ID-based access on the documents list + views.get_by_key : ID-based access on the views list """ if item in self._named_attributes(): return self.__dict__.__getitem__(item) diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 671ff228..5d6952d3 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -10,6 +10,16 @@ core functionality for deserializing MMIF JSON data into live objects and serializing live objects into MMIF JSON data. Specialized behavior for the different components of MMIF is added in the subclasses. + +This module defines two main collection types: + +- :class:`DataList`: List-like collections that support integer/slice indexing. + Use ``get_by_key()`` for string ID access. +- :class:`DataDict`: Dict-like collections that support string key access. + +.. versionchanged:: 1.1.3 + DataList now only accepts integers/slices in ``__getitem__``. Use + ``get_by_key()`` for string-based ID access. """ import json @@ -317,17 +327,33 @@ def __getitem__(self, key) -> Any: def get(self, obj_id, default=None): """ - High-level getter for Mmif. This will try to find any object, given - an identifier or an immediate attribute name. When nothing is found, - this will return a default value instead of raising an error. + High-level safe getter that returns a default value instead of raising KeyError. + + This method wraps ``__getitem__()`` with exception handling, making it safe + to query for objects that might not exist. Available on all MmifObject subclasses. - :param obj_id: an immediate attribute name or an object identifier - (a document ID, a view ID, or an annotation ID). When - annotation ID is given as a "short" ID (without view - ID prefix), the method will try to find a match from - the first view, and return immediately if found. - :param default: the default value to return if none is found - :return: the object searched for or the default value + :param obj_id: An attribute name or object identifier (document ID, view ID, + annotation ID, or property name depending on the object type). + For Mmif objects: when annotation ID is given as a "short" ID + (without view ID prefix), searches from the first view. + :param default: The value to return if the key is not found (default: None) + :return: The object/value searched for, or the default value if not found + + Examples + -------- + Safe access pattern (works on all MmifObject subclasses): + + >>> # On Mmif objects: + >>> view = mmif.get('v1', default=None) # Returns None if not found + >>> doc = mmif.get('doc1', default=None) + >>> + >>> # On Annotation/Document objects: + >>> label = annotation.get('label', default='unknown') + >>> author = document.get('author', default='anonymous') + + See Also + -------- + __getitem__ : Direct access that raises KeyError when not found """ try: return self.__getitem__(obj_id) @@ -396,29 +422,78 @@ def _deserialize(self, input_list: list) -> None: def get(self, key: str, default=None) -> Optional[T]: """ .. deprecated:: 1.1.3 - Will be removed in 2.0.0. - Use `__getitem__` for integer access or `get_by_key` for string-based - ID or attribute access. + Do not use in new code. Will be removed in 2.0.0. + Use ``get_by_key()`` for string-based ID access or ``[int]`` for + positional access. + + Deprecated method for retrieving list elements by string ID. - Standard dictionary-style get() method, albeit with no ``default`` - parameter. Relies on the implementation of __getitem__. + :param key: The string ID of the element to search for + :param default: The default value to return if the key is not found + :return: The element matching the ID, or the default value - Will return ``None`` if the key is not found. + Examples + -------- + Old pattern (deprecated, do not use): - :param key: the key to search for - :param default: the default value to return if the key is not found - :return: the value matching that key + >>> view = mmif.views.get('v1') # DeprecationWarning! + + New patterns to use instead: + + >>> # For ID-based access: + >>> view = mmif.views.get_by_key('v1') + >>> # For positional access: + >>> view = mmif.views[0] + >>> # For high-level safe access: + >>> view = mmif.get('v1', default=None) + + See Also + -------- + get_by_key : Replacement method for string ID access + __getitem__ : List-style positional access with integers """ warnings.warn( - "The 'get' method is deprecated and will be removed in a future version. " - "Use `__getitem__` for integer access or `get_by_key` for string-based " \ - "ID or attribute access.", + "The 'get' method is deprecated and will be removed in 2.0.0. " + "Use `get_by_key()` for string-based ID access or `[int]` for " + "positional access.", DeprecationWarning, stacklevel=2 ) return self.get_by_key(key, default) def get_by_key(self, key: str, default=None) -> Optional[T]: + """ + .. versionadded:: 1.1.3 + + Retrieve an element from the list by its string ID. + + This is the recommended way to access list elements by their identifier. + Unlike ``__getitem__`` which only accepts integers for list-style access, + this method accepts string IDs and returns a default value if not found. + + :param key: The string ID of the element to retrieve + :param default: The value to return if the ID is not found (default: None) + :return: The element with the matching ID, or the default value + + Examples + -------- + Accessing elements by ID: + + >>> # Get a view by ID: + >>> view = mmif.views.get_by_key('v1') + >>> if view is None: + ... print("View not found") + >>> + >>> # Get with a custom default: + >>> doc = mmif.documents.get_by_key('doc99', default=default_doc) + >>> + >>> # Get annotation from a view: + >>> ann = view.annotations.get_by_key('v1:a1') + + See Also + -------- + __getitem__ : List-style positional access with integers/slices + """ return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: @@ -445,6 +520,37 @@ def append(self, value, overwrite): raise NotImplementedError() def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: + """ + List-style positional access using integers or slices. + + This method provides pythonic list behavior - it only accepts integers + for positional access or slices for range access. For string-based ID + access, use ``get_by_key()`` instead. + + :param key: An integer index or slice object + :return: The element at the index, or a list of elements for slices + :raises TypeError: If key is not an integer or slice (e.g., if a string is passed) + + Examples + -------- + Positional access (pythonic list behavior): + + >>> # Get first view: + >>> first_view = mmif.views[0] + >>> + >>> # Get last document: + >>> last_doc = mmif.documents[-1] + >>> + >>> # Slice to get multiple elements: + >>> first_three_views = mmif.views[0:3] + >>> + >>> # This will raise TypeError: + >>> view = mmif.views['v1'] # TypeError! Use get_by_key('v1') instead + + See Also + -------- + get_by_key : String-based ID access with optional default value + """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. # We can convert values to a list and index it. @@ -487,6 +593,26 @@ def _serialize(self, *args, **kwargs) -> dict: return super()._serialize(self._items) def get(self, key: T, default=None) -> Optional[S]: + """ + Dictionary-style safe access with optional default value. + + This method provides pythonic dict behavior - returns the value for + the given key, or a default value if the key is not found. + + :param key: The key to look up + :param default: The value to return if key is not found (default: None) + :return: The value associated with the key, or the default value + + Examples + -------- + >>> # Access contains metadata: + >>> timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) + >>> if timeframe_meta is None: + ... print("No TimeFrame annotations in this view") + >>> + >>> # With custom default: + >>> value = some_dict.get('key', default={}) + """ return self._items.get(key, default) def _append_with_key(self, key: T, value: S, overwrite=False) -> None: diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index 82a624dc..557804a0 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -4,6 +4,15 @@ In MMIF, views are created by apps in a pipeline that are annotating data that was previously present in the MMIF file. + +The :class:`View` class is a high-level container that provides convenient +string-based access to annotations via ``view[id]``. The underlying +``annotations`` attribute is a list-like collection that uses integer indexing; +use ``get_by_key()`` on it for ID-based access. + +.. versionchanged:: 1.1.3 + AnnotationsList now only accepts integer/slice indexing. + Use ``get_by_key()`` for string ID access on that collection. """ import json import warnings @@ -26,9 +35,38 @@ class View(MmifObject): a list of annotations, and potentially a JSON-LD ``@context`` IRI. + This is a high-level container object that provides convenient string-based + access to annotations using their IDs. The underlying ``annotations`` collection + is list-like and uses integer indexing, but View itself accepts string IDs for + convenient access. + If ``view_obj`` is not provided, an empty View will be generated. :param view_obj: the JSON data that defines the view + + Examples + -------- + Accessing annotations by ID (high-level, convenient): + + >>> view = mmif['v1'] + >>> ann = view['v1:a1'] # Get annotation by ID + >>> doc = view['v1:td1'] # Get document by ID + >>> + >>> # Safe access with default: + >>> ann = view.get('v1:a999', default=None) + + Accessing via underlying list (positional access): + + >>> first_ann = view.annotations[0] # First annotation + >>> last_ann = view.annotations[-1] # Last annotation + >>> some_anns = view.annotations[1:5] # Slice of annotations + >>> + >>> # Access by ID on list: + >>> ann = view.annotations.get_by_key('v1:a1') + + .. versionchanged:: 1.1.3 + Underlying ``annotations`` list now only accepts integer indexing. + Use ``get_by_key()`` for string ID access on that list. """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -294,11 +332,48 @@ def get_document_by_id(self, doc_id) -> Document: def __getitem__(self, key: str) -> 'Annotation': """ - index ([]) implementation for View. - - :raises KeyError: if the key is not found - :param key: the search string. - :return: the :class:`mmif.serialize.annotation.Annotation` object searched for + High-level string-based access to annotations by their IDs. + + This method provides convenient access to annotations and documents + within this view using their string identifiers. + + Note: This is a high-level convenience method on the View container itself. + The underlying ``annotations`` collection is list-like and only supports + integer indexing - use ``get_by_key()`` on it for ID-based access. + + :param key: The annotation or document ID (e.g., 'v1:a1', 'v1:td1'), + or an attribute name (e.g., 'metadata', 'annotations') + :return: The requested Annotation, Document, or attribute object + :raises KeyError: If the key is not found + + Examples + -------- + High-level access by ID: + + >>> view = mmif['v1'] + >>> + >>> # Access annotations: + >>> ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' + >>> + >>> # Access text documents in view: + >>> doc = view['v1:td1'] # Returns Document with ID 'v1:td1' + >>> + >>> # Access attributes: + >>> metadata = view['metadata'] # Returns ViewMetadata object + >>> + >>> # Will raise KeyError: + >>> ann = view['nonexistent'] # KeyError! + + For list-style positional access, use the underlying collection: + + >>> first_ann = view.annotations[0] # Integer index + >>> ann_by_id = view.annotations.get_by_key('v1:a1') # ID-based access + + See Also + -------- + get : Safe access with default value instead of raising KeyError + annotations.get_by_key : ID-based access on the annotations list + get_annotations : Search for annotations by type or properties """ if key in self._named_attributes(): return self.__dict__[key] From 15dd3df77ccfff1b405a7a13dc59c8003512bc17 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 18 Nov 2025 15:50:08 -0500 Subject: [PATCH 07/14] get rid of intermediate `get_by_key` implementation and deprecating list-like classes' `get` once and for all --- mmif/serialize/annotation.py | 1 + mmif/serialize/mmif.py | 27 ++---- mmif/serialize/model.py | 79 +++++------------ mmif/serialize/view.py | 24 ++---- tests/mmif_examples.py | 42 ++++++++- tests/test_serialize.py | 162 +++++++++++++++++++++++++++++++++-- 6 files changed, 229 insertions(+), 106 deletions(-) diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index f3b23fc8..c7c00f3d 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -232,6 +232,7 @@ def add_property(self, name: str, :param name: the name of the property :param value: the property's desired value + :return: None """ # if self.check_prop_value_is_simple_enough(value): self.properties[name] = value diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index dfd905fe..e8957682 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -5,13 +5,9 @@ The :class:`Mmif` class is a high-level container that provides convenient string-based access to documents, views, and annotations via ``mmif[id]``. The underlying ``documents`` and ``views`` attributes are list-like collections -that use integer indexing; use ``get_by_key()`` on those for ID-based access. +that use integer indexing; use container-level access for ID-based lookups. See the specification docs and the JSON Schema file for more information. - -.. versionchanged:: 1.1.3 - DocumentsList and ViewsList now only accept integer/slice indexing. - Use ``get_by_key()`` for string ID access on those collections. """ import json @@ -173,14 +169,6 @@ class Mmif(MmifObject): >>> first_doc = mmif.documents[0] # First document >>> last_view = mmif.views[-1] # Last view >>> all_views = mmif.views[1:4] # Slice of views - >>> - >>> # Access by ID on lists: - >>> doc = mmif.documents.get_by_key('m1') - >>> view = mmif.views.get_by_key('v1') - - .. versionchanged:: 1.1.3 - Underlying ``documents`` and ``views`` lists now only accept integer - indexing. Use ``get_by_key()`` for string ID access on those lists. """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -821,7 +809,7 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada Note: This is a high-level convenience method on the Mmif container itself. The underlying ``documents`` and ``views`` collections are list-like and - only support integer indexing - use ``get_by_key()`` on those for ID-based access. + only support integer indexing. :param item: An object identifier: - Document ID (e.g., 'm1', 'd1') @@ -856,31 +844,28 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada >>> first_doc = mmif.documents[0] # Integer index >>> second_view = mmif.views[1] # Integer index - >>> doc_by_id = mmif.documents.get_by_key('m1') # ID-based access See Also -------- get : Safe access with default value instead of raising KeyError - documents.get_by_key : ID-based access on the documents list - views.get_by_key : ID-based access on the views list """ if item in self._named_attributes(): return self.__dict__.__getitem__(item) if self.id_delimiter in item: vid, _ = item.split(self.id_delimiter, 1) - view = self.views.get_by_key(vid) + view = self.views._items.get(vid) if view is None: raise KeyError(f"View with ID {vid} not found in the MMIF object.") - ann = view.annotations.get_by_key(item) + ann = view.annotations._items.get(item) if ann is None: raise KeyError(f"Annotation with ID {item} not found in the MMIF object.") return ann else: # search for document first, then views # raise KeyError if nothing is found - ret = self.documents.get_by_key(item) + ret = self.documents._items.get(item) if ret is None: - ret = self.views.get_by_key(item) + ret = self.views._items.get(item) if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") else: diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index bea8db88..3c5e6986 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -13,13 +13,12 @@ This module defines two main collection types: -- :class:`DataList`: List-like collections that support integer/slice indexing. - Use ``get_by_key()`` for string ID access. +- :class:`DataList`: List-like collections that support integer/slice + indexing. For ID-based access, use indexing or ``get`` in the container + level. For example, for DocumentList, use its parent Mmif object's + getter methods to access documents by ID. (e.g., ``mmif['doc1']``). - :class:`DataDict`: Dict-like collections that support string key access. -.. versionchanged:: 1.1.3 - DataList now only accepts integers/slices in ``__getitem__``. Use - ``get_by_key()`` for string-based ID access. """ import json @@ -451,13 +450,13 @@ def get(self, key: str, default=None) -> Optional[T]: """ .. deprecated:: 1.1.3 Do not use in new code. Will be removed in 2.0.0. - Use ``get_by_key()`` for string-based ID access or ``[int]`` for - positional access. + Use container-level access or positional indexing instead. Deprecated method for retrieving list elements by string ID. :param key: the key to search for - :param default: the default value to return if the key is not found (defaults to None) + :param default: the default value to return if the key is not found + (defaults to None) :return: the value matching that key, or the default value if not found Examples @@ -468,60 +467,24 @@ def get(self, key: str, default=None) -> Optional[T]: New patterns to use instead: - >>> # For ID-based access: - >>> view = mmif.views.get_by_key('v1') + >>> # For ID-based access, use container: + >>> view = mmif['v1'] + >>> # Or with safe access: + >>> view = mmif.get('v1', default=None) >>> # For positional access: >>> view = mmif.views[0] - >>> # For high-level safe access: - >>> view = mmif.get('v1', default=None) See Also -------- - get_by_key : Replacement method for string ID access __getitem__ : List-style positional access with integers """ warnings.warn( - "The 'get' method is deprecated and will be removed in 2.0.0. " - "Use `get_by_key()` for string-based ID access or `[int]` for " - "positional access.", + "The 'get' method on list-like collections is deprecated and " + "will be removed in 2.0.0. Use container-level access " + "(e.g., mmif['v1']) or positional indexing (e.g., views[0]).", DeprecationWarning, stacklevel=2 ) - return self.get_by_key(key, default) - - def get_by_key(self, key: str, default=None) -> Optional[T]: - """ - .. versionadded:: 1.1.3 - - Retrieve an element from the list by its string ID. - - This is the recommended way to access list elements by their identifier. - Unlike ``__getitem__`` which only accepts integers for list-style access, - this method accepts string IDs and returns a default value if not found. - - :param key: The string ID of the element to retrieve - :param default: The value to return if the ID is not found (default: None) - :return: The element with the matching ID, or the default value - - Examples - -------- - Accessing elements by ID: - - >>> # Get a view by ID: - >>> view = mmif.views.get_by_key('v1') - >>> if view is None: - ... print("View not found") - >>> - >>> # Get with a custom default: - >>> doc = mmif.documents.get_by_key('doc99', default=default_doc) - >>> - >>> # Get annotation from a view: - >>> ann = view.annotations.get_by_key('v1:a1') - - See Also - -------- - __getitem__ : List-style positional access with integers/slices - """ return self._items.get(key, default) def _append_with_key(self, key: str, value: T, overwrite=False) -> None: @@ -553,11 +516,12 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: This method provides pythonic list behavior - it only accepts integers for positional access or slices for range access. For string-based ID - access, use ``get_by_key()`` instead. + access, use container-level indexing instead (e.g., ``mmif['v1']``). :param key: An integer index or slice object :return: The element at the index, or a list of elements for slices - :raises TypeError: If key is not an integer or slice (e.g., if a string is passed) + :raises TypeError: If key is not an integer or slice (e.g., if a + string is passed) Examples -------- @@ -573,11 +537,10 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: >>> first_three_views = mmif.views[0:3] >>> >>> # This will raise TypeError: - >>> view = mmif.views['v1'] # TypeError! Use get_by_key('v1') instead - - See Also - -------- - get_by_key : String-based ID access with optional default value + >>> view = mmif.views['v1'] # TypeError! + >>> + >>> # For ID-based access, use container: + >>> view = mmif['v1'] # Correct way """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index d9e853c1..9b419b80 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -8,11 +8,8 @@ The :class:`View` class is a high-level container that provides convenient string-based access to annotations via ``view[id]``. The underlying ``annotations`` attribute is a list-like collection that uses integer indexing; -use ``get_by_key()`` on it for ID-based access. +use container-level access for ID-based lookups. -.. versionchanged:: 1.1.3 - AnnotationsList now only accepts integer/slice indexing. - Use ``get_by_key()`` for string ID access on that collection. """ import json import warnings @@ -60,13 +57,6 @@ class View(MmifObject): >>> first_ann = view.annotations[0] # First annotation >>> last_ann = view.annotations[-1] # Last annotation >>> some_anns = view.annotations[1:5] # Slice of annotations - >>> - >>> # Access by ID on list: - >>> ann = view.annotations.get_by_key('v1:a1') - - .. versionchanged:: 1.1.3 - Underlying ``annotations`` list now only accepts integer indexing. - Use ``get_by_key()`` for string ID access on that list. """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -325,7 +315,7 @@ def get_document_by_id(self, doc_id) -> Document: "View.get_document_by_id() is deprecated, use view[doc_id] instead.", DeprecationWarning ) - doc_found = self.annotations.get_by_key(doc_id) + doc_found = self.annotations._items.get(doc_id) if not isinstance(doc_found, Document): raise KeyError(f"Document \"{doc_id}\" not found in view {self.id}.") return cast(Document, doc_found) @@ -337,9 +327,9 @@ def __getitem__(self, key: str) -> 'Annotation': This method provides convenient access to annotations and documents within this view using their string identifiers. - Note: This is a high-level convenience method on the View container itself. - The underlying ``annotations`` collection is list-like and only supports - integer indexing - use ``get_by_key()`` on it for ID-based access. + Note: This is a high-level convenience method on the View container + itself. The underlying ``annotations`` collection is list-like and + only supports integer indexing. :param key: The annotation or document ID (e.g., 'v1:a1', 'v1:td1'), or an attribute name (e.g., 'metadata', 'annotations') @@ -367,17 +357,15 @@ def __getitem__(self, key: str) -> 'Annotation': For list-style positional access, use the underlying collection: >>> first_ann = view.annotations[0] # Integer index - >>> ann_by_id = view.annotations.get_by_key('v1:a1') # ID-based access See Also -------- get : Safe access with default value instead of raising KeyError - annotations.get_by_key : ID-based access on the annotations list get_annotations : Search for annotations by type or properties """ if key in self._named_attributes(): return self.__dict__[key] - anno_result = self.annotations.get_by_key(key) + anno_result = self.annotations._items.get(key) if not anno_result: raise KeyError("Annotation ID not found: %s" % key) return anno_result diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index 5635d144..7e860c0a 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -1,4 +1,6 @@ import itertools +import os +import subprocess from string import Template from urllib import request @@ -11,10 +13,44 @@ 'FRACTIONAL_EXAMPLES', ] + +def _load_from_url_or_git(url): + """ + Load content from URL or local git repository. + If LOCALMMIF env var is set, use git show to load from local repo. + LOCALMMIF should be the path to the local mmif repository. + """ + localmmif = os.environ.get('LOCALMMIF') + if localmmif: + # Extract the version/branch and file path from the URL + # URL format: https://raw.githubusercontent.com/clamsproject/mmif/{version}/{filepath} + url_prefix = "https://raw.githubusercontent.com/clamsproject/mmif/" + if url.startswith(url_prefix): + remainder = url[len(url_prefix):] + parts = remainder.split('/', 1) + if len(parts) == 2: + version, filepath = parts + # Use git show to get the file from the specific version + git_ref = f"{version}:{filepath}" + try: + result = subprocess.run( + ['git', 'show', git_ref], + cwd=localmmif, + capture_output=True, + text=True, + check=True + ) + return result.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to load {git_ref} from local git repo at {localmmif}: {e.stderr}") + + # Fallback to URL loading + return request.urlopen(url).read().decode('utf-8') + everything_file_url = f"https://raw.githubusercontent.com/clamsproject/mmif/{__specver__}/specifications/samples/everything/raw.json" -old_mmif_w_short_id = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" -EVERYTHING_JSON = request.urlopen(everything_file_url).read().decode('utf-8') -OLD_SHORTID_JSON = request.urlopen(old_mmif_w_short_id).read().decode('utf-8') +old_mmif_w_short_id_url = f"https://raw.githubusercontent.com/clamsproject/mmif/1.0.5/specifications/samples/everything/raw.json" +EVERYTHING_JSON = _load_from_url_or_git(everything_file_url) +OLD_SHORTID_JSON = _load_from_url_or_git(old_mmif_w_short_id_url) SWT_1_0_JSON = open('tests/samples/1.0/swt.mmif').read() # for keys and values in chain all typevers in mmif.vocabulary.*_types modules diff --git a/tests/test_serialize.py b/tests/test_serialize.py index f68c2f5b..b0836c5a 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -885,7 +885,7 @@ def test_new_textdocument(self): self.assertTrue(td1.properties.text_value == td1.text_value) self.assertNotEqual(td1.text_language, td2.text_language) self.assertEqual(english_text, td1.text_value) - self.assertEqual(td1, self.view_obj.annotations.get_by_key(td1.id)) + self.assertEqual(td1, self.view_obj[td1.id]) td3 = self.view_obj.new_textdocument(english_text, mime='plain/text') self.assertEqual(td1.text_value, td3.text_value) self.assertEqual(len(td1.properties), len(td3.properties) - 1) @@ -1320,9 +1320,9 @@ def test_document_adding_duplicate_properties(self): doc1.add_property('publisher', 'they') self.assertEqual(2, len(doc1._props_pending)) mmif_roundtrip3 = Mmif(mmif_roundtrip2.serialize()) - r0_v_anns = list(mmif_roundtrip3.views.get_by_key(r0_vid).get_annotations(AnnotationTypes.Annotation)) - r1_v_anns = list(mmif_roundtrip3.views.get_by_key(r1_vid).get_annotations(AnnotationTypes.Annotation)) - r2_v_anns = list(mmif_roundtrip3.views.get_by_key(r2_vid).get_annotations(AnnotationTypes.Annotation)) + r0_v_anns = list(mmif_roundtrip3[r0_vid].get_annotations(AnnotationTypes.Annotation)) + r1_v_anns = list(mmif_roundtrip3[r1_vid].get_annotations(AnnotationTypes.Annotation)) + r2_v_anns = list(mmif_roundtrip3[r2_vid].get_annotations(AnnotationTypes.Annotation)) # two props (`author` and `publisher`) are serialized to one `Annotation` objects self.assertEqual(1, len(r0_v_anns)) self.assertEqual(0, len(r1_v_anns)) @@ -1408,7 +1408,7 @@ def test_capital_annotation_generation_viewfinder(self): mmif[f'doc{i+1}'].add_property('author', authors[i]) mmif_roundtrip = Mmif(mmif.serialize()) for i in range(1, 3): - cap_anns = list(mmif_roundtrip.views.get_by_key(f'v{i}').get_annotations(AnnotationTypes.Annotation)) + cap_anns = list(mmif_roundtrip[f'v{i}'].get_annotations(AnnotationTypes.Annotation)) self.assertEqual(1, len(cap_anns)) self.assertEqual(authors[i-1], cap_anns[0].get_property('author')) @@ -1539,7 +1539,157 @@ def test_setitem_fail_on_reserved_name(self): self.assertEqual("can't set item on a reserved name", ke.args[0]) def test_get(self): - self.assertEqual(self.datalist.get_by_key('v1'), self.datalist.get('v1')) + # Test that get() returns the correct view and returns default for + # non-existent IDs + view = self.datalist.get('v1') + self.assertIsNotNone(view) + self.assertEqual('v1', view.id) + + # Test default value + self.assertIsNone(self.datalist.get('nonexistent')) + self.assertEqual('default', self.datalist.get('nonexistent', 'default')) + + # New tests for pythonic getters (#295) + def test_integer_indexing(self): + """Test that DataList supports integer indexing (list-like behavior).""" + # Positive indexing + first_view = self.datalist[0] + self.assertEqual('v1', first_view.id) + + second_view = self.datalist[1] + self.assertEqual('v2', second_view.id) + + # Negative indexing + last_view = self.datalist[-1] + self.assertEqual('v8', last_view.id) + + second_to_last = self.datalist[-2] + self.assertEqual('v7', second_to_last.id) + + def test_slice_indexing(self): + """Test that DataList supports slice indexing (list-like behavior).""" + # Basic slice + first_three = self.datalist[0:3] + self.assertEqual(3, len(first_three)) + self.assertIsInstance(first_three, list) + self.assertEqual('v1', first_three[0].id) + self.assertEqual('v3', first_three[2].id) + + # Slice with step + every_other = self.datalist[::2] + self.assertEqual(4, len(every_other)) + self.assertEqual('v1', every_other[0].id) + self.assertEqual('v3', every_other[1].id) + self.assertEqual('v5', every_other[2].id) + + # Slice from middle + middle = self.datalist[2:5] + self.assertEqual(3, len(middle)) + self.assertEqual('v3', middle[0].id) + self.assertEqual('v4', middle[1].id) + + # Empty slice + empty = self.datalist[10:20] + self.assertEqual(0, len(empty)) + + def test_string_indexing_raises_typeerror(self): + """Test that DataList raises TypeError for string indexing.""" + # String indexing should raise TypeError + with self.assertRaises(TypeError) as cm: + _ = self.datalist['v1'] + self.assertIn("list indices must be integers or slices", str(cm.exception)) + self.assertIn("not str", str(cm.exception)) + + # Test with documents list too + with self.assertRaises(TypeError) as cm: + _ = self.mmif_obj.documents['m1'] + self.assertIn("list indices must be integers or slices", str(cm.exception)) + + def test_get_deprecated_warning(self): + """Test that get() method raises DeprecationWarning.""" + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = self.datalist.get('v1') + + # Check that warning was raised + self.assertEqual(1, len(w)) + self.assertTrue(issubclass(w[0].category, DeprecationWarning)) + self.assertIn("deprecated", str(w[0].message).lower()) + self.assertIn("2.0.0", str(w[0].message)) + + # But it should still work and return the view + self.assertIsNotNone(result) + self.assertEqual('v1', result.id) + + def test_high_level_mmif_string_access(self): + """Test that high-level Mmif container accepts string IDs.""" + # Mmif should accept string access (container behavior) + view = self.mmif_obj['v1'] + self.assertEqual('v1', view.id) + + doc = self.mmif_obj['m1'] + self.assertEqual('m1', doc.id) + + # Long-form annotation ID + ann = self.mmif_obj['v5:bb1'] + self.assertIsNotNone(ann) + + def test_high_level_view_string_access(self): + """Test that high-level View container accepts string IDs.""" + view = self.mmif_obj['v4'] + + # View should accept string access (container behavior) + ann = view['v4:a1'] + self.assertIsNotNone(ann) + self.assertEqual('v4:a1', ann.id) + + def test_mmif_get_with_default(self): + """Test safe access on Mmif with default values.""" + # Existing object + view = self.mmif_obj.get('v1') + self.assertIsNotNone(view) + self.assertEqual('v1', view.id) + + # Non-existent object with default + result = self.mmif_obj.get('v999', default=None) + self.assertIsNone(result) + + # Non-existent with custom default + default_value = "not found" + result = self.mmif_obj.get('v999', default=default_value) + self.assertEqual(default_value, result) + + def test_mixed_access_patterns(self): + """Test that different access patterns can be used together.""" + # Integer access on list + first_view = self.mmif_obj.views[0] + + # String access on high-level container + specific_view = self.mmif_obj['v5'] + + # All should work + self.assertEqual('v1', first_view.id) + self.assertEqual('v5', specific_view.id) + + def test_datalist_all_collections(self): + """Test that all DataList subclasses behave consistently.""" + # ViewsList + view_by_int = self.mmif_obj.views[0] + view_by_container = self.mmif_obj[view_by_int.id] + self.assertEqual(view_by_int.id, view_by_container.id) + + # DocumentsList + if len(self.mmif_obj.documents) > 0: + doc_by_int = self.mmif_obj.documents[0] + doc_by_container = self.mmif_obj[doc_by_int.id] + self.assertEqual(doc_by_int.id, doc_by_container.id) + + # AnnotationsList + view = self.mmif_obj['v4'] + if len(view.annotations) > 0: + ann_by_int = view.annotations[0] + ann_by_container = view[ann_by_int.id] + self.assertEqual(ann_by_int.id, ann_by_container.id) def test_update(self): other_contains = """{ From d0024a607f7c9f86d840f90147521c6fe0b1663d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 18 Nov 2025 23:16:32 +0000 Subject: [PATCH 08/14] Initial plan From ad8d4e2d3d6eb354a68c1529dd66e6c29c37b308 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 18 Nov 2025 23:21:29 +0000 Subject: [PATCH 09/14] Address PR feedback: simplify returns, use KeyError, add pathlib validation Co-authored-by: keighrim <9062727+keighrim@users.noreply.github.com> --- mmif/serialize/mmif.py | 5 +---- mmif/serialize/model.py | 4 ++-- tests/mmif_examples.py | 10 +++++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index e8957682..004be991 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -868,7 +868,4 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada ret = self.views._items.get(item) if ret is None: raise KeyError(f"Object with ID {item} not found in the MMIF object. ") - else: - return ret - else: - return ret + return ret diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index 3c5e6986..b052b0fb 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -337,14 +337,14 @@ def __contains__(self, key: str) -> bool: try: self.__getitem__(key) return True - except (TypeError, AttributeError, KeyError): + except (TypeError, KeyError): return False def __getitem__(self, key) -> Any: if key in self._named_attributes(): value = self.__dict__[key] elif self._unnamed_attributes is None: - raise AttributeError(f"Additional properties are disallowed by {self.__class__}: {key}") + raise KeyError(f"Additional properties are disallowed by {self.__class__}: {key}") else: value = self._unnamed_attributes[key] if key not in self._required_attributes and self.is_empty(value): diff --git a/tests/mmif_examples.py b/tests/mmif_examples.py index 7e860c0a..b19f9d9a 100644 --- a/tests/mmif_examples.py +++ b/tests/mmif_examples.py @@ -1,6 +1,7 @@ import itertools import os import subprocess +from pathlib import Path from string import Template from urllib import request @@ -20,8 +21,11 @@ def _load_from_url_or_git(url): If LOCALMMIF env var is set, use git show to load from local repo. LOCALMMIF should be the path to the local mmif repository. """ - localmmif = os.environ.get('LOCALMMIF') - if localmmif: + localmmif_str = os.environ.get('LOCALMMIF') + if localmmif_str: + localmmif = Path(localmmif_str) + if not localmmif.is_dir(): + raise ValueError(f"LOCALMMIF path is not a valid directory: {localmmif}") # Extract the version/branch and file path from the URL # URL format: https://raw.githubusercontent.com/clamsproject/mmif/{version}/{filepath} url_prefix = "https://raw.githubusercontent.com/clamsproject/mmif/" @@ -35,7 +39,7 @@ def _load_from_url_or_git(url): try: result = subprocess.run( ['git', 'show', git_ref], - cwd=localmmif, + cwd=str(localmmif), capture_output=True, text=True, check=True From e32ec869d1bb500f2ed0773fc0d33f729fb6c86f Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Tue, 18 Nov 2025 19:28:47 -0500 Subject: [PATCH 10/14] fixed syntax error in docstrings --- .gitignore | 2 + mmif/serialize/annotation.py | 42 ++++++++++--------- mmif/serialize/mmif.py | 64 ++++++++++++++++------------- mmif/serialize/model.py | 80 ++++++++++++++++++++---------------- mmif/serialize/view.py | 54 +++++++++++++----------- 5 files changed, 137 insertions(+), 105 deletions(-) diff --git a/.gitignore b/.gitignore index 84afd451..5dd9b09b 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,5 @@ mmif/vocabulary documentation/_build/ /VERSION +_issues + diff --git a/mmif/serialize/annotation.py b/mmif/serialize/annotation.py index c7c00f3d..6f1471aa 100644 --- a/mmif/serialize/annotation.py +++ b/mmif/serialize/annotation.py @@ -276,15 +276,17 @@ def get(self, prop_name: str, default=None): Examples -------- - >>> # Access annotation properties: - >>> label = annotation.get('label', default='unknown') - >>> start_time = annotation.get('start', default=0) - >>> - >>> # Access @type: - >>> at_type = annotation.get('@type') - >>> - >>> # Safe access with custom default: - >>> targets = annotation.get('targets', default=[]) + .. code-block:: python + + # Access annotation properties: + label = annotation.get('label', default='unknown') + start_time = annotation.get('start', default=0) + + # Access @type: + at_type = annotation.get('@type') + + # Safe access with custom default: + targets = annotation.get('targets', default=[]) See Also -------- @@ -424,16 +426,18 @@ def get(self, prop_name, default=None): Examples -------- - >>> # Access document properties: - >>> mime = document.get('mime', default='application/octet-stream') - >>> location = document.get('location') - >>> - >>> # Access properties added after creation (pending): - >>> author = document.get('author', default='anonymous') - >>> publisher = document.get('publisher') - >>> - >>> # Access ephemeral properties from Annotation objects: - >>> sentiment = document.get('sentiment', default='neutral') + .. code-block:: python + + # Access document properties: + mime = document.get('mime', default='application/octet-stream') + location = document.get('location') + + # Access properties added after creation (pending): + author = document.get('author', default='anonymous') + publisher = document.get('publisher') + + # Access ephemeral properties from Annotation objects: + sentiment = document.get('sentiment', default='neutral') See Also -------- diff --git a/mmif/serialize/mmif.py b/mmif/serialize/mmif.py index 004be991..2b759765 100644 --- a/mmif/serialize/mmif.py +++ b/mmif/serialize/mmif.py @@ -156,19 +156,23 @@ class Mmif(MmifObject): -------- Accessing objects by ID (high-level, convenient): - >>> mmif = Mmif(mmif_json) - >>> doc = mmif['m1'] # Get document by ID - >>> view = mmif['v1'] # Get view by ID - >>> ann = mmif['v1:a1'] # Get annotation by long-form ID - >>> - >>> # Safe access with default: - >>> doc = mmif.get('m99', default=None) + .. code-block:: python + + mmif = Mmif(mmif_json) + doc = mmif['m1'] # Get document by ID + view = mmif['v1'] # Get view by ID + ann = mmif['v1:a1'] # Get annotation by long-form ID + + # Safe access with default: + doc = mmif.get('m99', default=None) Accessing via underlying lists (positional access): - >>> first_doc = mmif.documents[0] # First document - >>> last_view = mmif.views[-1] # Last view - >>> all_views = mmif.views[1:4] # Slice of views + .. code-block:: python + + first_doc = mmif.documents[0] # First document + last_view = mmif.views[-1] # Last view + all_views = mmif.views[1:4] # Slice of views """ def __init__(self, mmif_obj: Optional[Union[bytes, str, dict]] = None, *, validate: bool = True) -> None: @@ -823,27 +827,31 @@ def __getitem__(self, item: str) -> Union[Document, View, Annotation, MmifMetada -------- High-level access by ID: - >>> mmif = Mmif(mmif_json) - >>> - >>> # Access documents: - >>> doc = mmif['m1'] # Returns Document with ID 'm1' - >>> - >>> # Access views: - >>> view = mmif['v1'] # Returns View with ID 'v1' - >>> - >>> # Access annotations (long-form ID): - >>> ann = mmif['v1:a1'] # Returns Annotation from view v1 - >>> - >>> # Access attributes: - >>> metadata = mmif['metadata'] # Returns MmifMetadata object - >>> - >>> # Will raise KeyError: - >>> doc = mmif['nonexistent'] # KeyError! + .. code-block:: python + + mmif = Mmif(mmif_json) + + # Access documents: + doc = mmif['m1'] # Returns Document with ID 'm1' + + # Access views: + view = mmif['v1'] # Returns View with ID 'v1' + + # Access annotations (long-form ID): + ann = mmif['v1:a1'] # Returns Annotation from view v1 + + # Access attributes: + metadata = mmif['metadata'] # Returns MmifMetadata object + + # Will raise KeyError: + doc = mmif['nonexistent'] # KeyError! For list-style positional access, use the underlying collections: - >>> first_doc = mmif.documents[0] # Integer index - >>> second_view = mmif.views[1] # Integer index + .. code-block:: python + + first_doc = mmif.documents[0] # Integer index + second_view = mmif.views[1] # Integer index See Also -------- diff --git a/mmif/serialize/model.py b/mmif/serialize/model.py index b052b0fb..1bec7b29 100644 --- a/mmif/serialize/model.py +++ b/mmif/serialize/model.py @@ -370,13 +370,15 @@ def get(self, obj_id, default=None): -------- Safe access pattern (works on all MmifObject subclasses): - >>> # On Mmif objects: - >>> view = mmif.get('v1', default=None) # Returns None if not found - >>> doc = mmif.get('doc1', default=None) - >>> - >>> # On Annotation/Document objects: - >>> label = annotation.get('label', default='unknown') - >>> author = document.get('author', default='anonymous') + .. code-block:: python + + # On Mmif objects: + view = mmif.get('v1', default=None) # Returns None if not found + doc = mmif.get('doc1', default=None) + + # On Annotation/Document objects: + label = annotation.get('label', default='unknown') + author = document.get('author', default='anonymous') See Also -------- @@ -463,16 +465,20 @@ def get(self, key: str, default=None) -> Optional[T]: -------- Old pattern (deprecated, do not use): - >>> view = mmif.views.get('v1') # DeprecationWarning! + .. code-block:: python + + view = mmif.views.get('v1') # DeprecationWarning! New patterns to use instead: - >>> # For ID-based access, use container: - >>> view = mmif['v1'] - >>> # Or with safe access: - >>> view = mmif.get('v1', default=None) - >>> # For positional access: - >>> view = mmif.views[0] + .. code-block:: python + + # For ID-based access, use container: + view = mmif['v1'] + # Or with safe access: + view = mmif.get('v1', default=None) + # For positional access: + view = mmif.views[0] See Also -------- @@ -527,20 +533,22 @@ def __getitem__(self, key: Union[int, slice]) -> Union[T, List[T]]: -------- Positional access (pythonic list behavior): - >>> # Get first view: - >>> first_view = mmif.views[0] - >>> - >>> # Get last document: - >>> last_doc = mmif.documents[-1] - >>> - >>> # Slice to get multiple elements: - >>> first_three_views = mmif.views[0:3] - >>> - >>> # This will raise TypeError: - >>> view = mmif.views['v1'] # TypeError! - >>> - >>> # For ID-based access, use container: - >>> view = mmif['v1'] # Correct way + .. code-block:: python + + # Get first view: + first_view = mmif.views[0] + + # Get last document: + last_doc = mmif.documents[-1] + + # Slice to get multiple elements: + first_three_views = mmif.views[0:3] + + # This will raise TypeError: + view = mmif.views['v1'] # TypeError! + + # For ID-based access, use container: + view = mmif['v1'] # Correct way """ if isinstance(key, (int, slice)): # Python's dicts preserve insertion order since 3.7. @@ -596,13 +604,15 @@ def get(self, key: T, default=None) -> Optional[S]: Examples -------- - >>> # Access contains metadata: - >>> timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) - >>> if timeframe_meta is None: - ... print("No TimeFrame annotations in this view") - >>> - >>> # With custom default: - >>> value = some_dict.get('key', default={}) + .. code-block:: python + + # Access contains metadata: + timeframe_meta = view.metadata.contains.get(AnnotationTypes.TimeFrame) + if timeframe_meta is None: + print("No TimeFrame annotations in this view") + + # With custom default: + value = some_dict.get('key', default={}) """ return self._items.get(key, default) diff --git a/mmif/serialize/view.py b/mmif/serialize/view.py index 9b419b80..80b7a65b 100644 --- a/mmif/serialize/view.py +++ b/mmif/serialize/view.py @@ -45,18 +45,22 @@ class View(MmifObject): -------- Accessing annotations by ID (high-level, convenient): - >>> view = mmif['v1'] - >>> ann = view['v1:a1'] # Get annotation by ID - >>> doc = view['v1:td1'] # Get document by ID - >>> - >>> # Safe access with default: - >>> ann = view.get('v1:a999', default=None) + .. code-block:: python + + view = mmif['v1'] + ann = view['v1:a1'] # Get annotation by ID + doc = view['v1:td1'] # Get document by ID + + # Safe access with default: + ann = view.get('v1:a999', default=None) Accessing via underlying list (positional access): - >>> first_ann = view.annotations[0] # First annotation - >>> last_ann = view.annotations[-1] # Last annotation - >>> some_anns = view.annotations[1:5] # Slice of annotations + .. code-block:: python + + first_ann = view.annotations[0] # First annotation + last_ann = view.annotations[-1] # Last annotation + some_anns = view.annotations[1:5] # Slice of annotations """ def __init__(self, view_obj: Optional[Union[bytes, str, dict]] = None, parent_mmif=None, *_) -> None: @@ -340,23 +344,27 @@ def __getitem__(self, key: str) -> 'Annotation': -------- High-level access by ID: - >>> view = mmif['v1'] - >>> - >>> # Access annotations: - >>> ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' - >>> - >>> # Access text documents in view: - >>> doc = view['v1:td1'] # Returns Document with ID 'v1:td1' - >>> - >>> # Access attributes: - >>> metadata = view['metadata'] # Returns ViewMetadata object - >>> - >>> # Will raise KeyError: - >>> ann = view['nonexistent'] # KeyError! + .. code-block:: python + + view = mmif['v1'] + + # Access annotations: + ann = view['v1:a1'] # Returns Annotation with ID 'v1:a1' + + # Access text documents in view: + doc = view['v1:td1'] # Returns Document with ID 'v1:td1' + + # Access attributes: + metadata = view['metadata'] # Returns ViewMetadata object + + # Will raise KeyError: + ann = view['nonexistent'] # KeyError! For list-style positional access, use the underlying collection: - >>> first_ann = view.annotations[0] # Integer index + .. code-block:: python + + first_ann = view.annotations[0] # Integer index See Also -------- From c4f1fb09b1c4bdb0589c73e3cfc912dcde61b9de Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 20 Nov 2025 11:20:17 -0500 Subject: [PATCH 11/14] added basic `mmif describe` cli a --- mmif/utils/cli/__init__.py | 1 + mmif/utils/cli/describe.py | 299 +++++++++++++++++++++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 mmif/utils/cli/describe.py diff --git a/mmif/utils/cli/__init__.py b/mmif/utils/cli/__init__.py index 77f6b74b..24855994 100644 --- a/mmif/utils/cli/__init__.py +++ b/mmif/utils/cli/__init__.py @@ -1,3 +1,4 @@ +from mmif.utils.cli import describe from mmif.utils.cli import rewind from mmif.utils.cli import source diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py new file mode 100644 index 00000000..92e26864 --- /dev/null +++ b/mmif/utils/cli/describe.py @@ -0,0 +1,299 @@ +import argparse +import hashlib +import json +import sys +import textwrap +from pathlib import Path +from typing import Union, List, Tuple, Optional + +from mmif import Mmif + + +def split_appname_appversion(long_app_id: str) -> Tuple[str, str]: + """ + Split app name and version from a long app identifier. + + Assumes the identifier looks like "uri://APP_DOMAIN/APP_NAME/APP_VERSION" + + :param long_app_id: Full app identifier URI + :return: Tuple of (app_name, app_version) + """ + app_path = Path(long_app_id).parts + app_name = app_path[2] if len(app_path) > 2 else None + app_version = app_path[3] if len(app_path) > 3 else None + if app_version is not None and app_name.endswith(app_version): + app_name = app_name[:-len(app_version) - 1] + if app_version == 'unresolvable': + app_version = None + return app_name, app_version + + +def generate_param_hash(params: dict) -> str: + """ + Generate MD5 hash from a parameter dictionary. + + Parameters are sorted alphabetically, joined as key=value pairs, + and hashed using MD5. This is not for security purposes, only for + generating consistent identifiers. + + :param params: Dictionary of parameters + :return: MD5 hash string (32 hex characters) + """ + if not params: + param_string = "" + else: + param_list = ['='.join([k, str(v)]) for k, v in params.items()] + param_list.sort() + param_string = ','.join(param_list) + return hashlib.md5(param_string.encode('utf-8')).hexdigest() + + +def get_pipeline_specs( + mmif_file: Union[str, Path] +) -> List[Tuple[str, str, dict, Optional[int], Optional[dict], int, dict]]: + """ + Read a MMIF file and extract the pipeline specification from it. + + :param mmif_file: Path to the MMIF file + :return: List of tuples containing (view_id, app_name, configs, + running_time_ms, running_hardware, annotation_count, + annotations_by_type) for each view in the pipeline + """ + if not isinstance(mmif_file, (str, Path)): + raise ValueError( + "MMIF file path must be a string or a Path object." + ) + + with open(mmif_file, "r") as f: + mmif_str = f.read() + + data = Mmif(mmif_str) + spec = [] + + for view in data.views: + # Skip views with errors, warnings, or no annotations + if view.has_error() or view.has_warnings(): + continue + elif len(view.annotations) == 0: + continue + + app = view.metadata.get("app") + configs = view.metadata.get("appConfiguration", {}) + + # Parse running time from hh:mm:ss.ms format to milliseconds + # Support both new (appProfiling.runningTime) and old (appRunningTime) + running_time = None + time_str = None + if "appProfiling" in view.metadata: + profiling = view.metadata["appProfiling"] + if isinstance(profiling, dict) and "runningTime" in profiling: + time_str = profiling["runningTime"] + elif "appRunningTime" in view.metadata: + time_str = view.metadata["appRunningTime"] + + if time_str: + try: + rest, ms = time_str.split(".") + running_time = sum( + int(x) * 60**i + for i, x in enumerate(reversed(rest.split(":"))) + ) * 1000 + int(ms) + except (ValueError, AttributeError): + # If parsing fails, leave as None + pass + + # Support both new (appProfiling.hardware) and old (appRunningHardware) + running_hardware = None + if "appProfiling" in view.metadata: + profiling = view.metadata["appProfiling"] + if isinstance(profiling, dict) and "hardware" in profiling: + running_hardware = profiling["hardware"] + elif "appRunningHardware" in view.metadata: + running_hardware = view.metadata["appRunningHardware"] + + # Count annotations and group by type + annotation_count = len(view.annotations) + annotations_by_type = {} + for annotation in view.annotations: + at_type = str(annotation.at_type) + annotations_by_type[at_type] = annotations_by_type.get( + at_type, 0 + ) + 1 + + spec.append(( + view.id, app, configs, running_time, running_hardware, + annotation_count, annotations_by_type + )) + + return spec + + +def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: + """ + Generate a pipeline identifier string from a MMIF file. + + The identifier follows the storage directory structure format: + app_name/version/param_hash/app_name2/version2/param_hash2/... + + Uses view.metadata.parameters (raw user-passed values) for hashing + to ensure reproducibility. + + :param mmif_file: Path to the MMIF file + :return: Pipeline identifier string + """ + if not isinstance(mmif_file, (str, Path)): + raise ValueError( + "MMIF file path must be a string or a Path object." + ) + + with open(mmif_file, "r") as f: + mmif_str = f.read() + + data = Mmif(mmif_str) + segments = [] + + for view in data.views: + # Skip views with errors, warnings, or no annotations + if view.has_error() or view.has_warnings(): + continue + elif len(view.annotations) == 0: + continue + + app = view.metadata.get("app") + app_name, app_version = split_appname_appversion(app) + + # Use raw parameters for reproducibility + try: + param_dict = view.metadata.parameters + except (KeyError, AttributeError): + param_dict = {} + + param_hash = generate_param_hash(param_dict) + + # Build segment: app_name/version/hash + version_str = app_version if app_version else "unversioned" + segments.append(f"{app_name}/{version_str}/{param_hash}") + + return '/'.join(segments) + + +def describe_argparser(): + """ + Returns two strings: one-line description of the argparser, and + additional material, which will be shown in `clams --help` and + `clams --help`, respectively. + """ + oneliner = ( + 'provides CLI to describe the pipeline specification from a MMIF ' + 'file.' + ) + additional = textwrap.dedent(""" + MMIF describe extracts pipeline information from a MMIF file, including + app names, runtime configurations, and runtime profiling statistics + for each view in the processing pipeline.""") + return oneliner, oneliner + '\n\n' + additional + + +def prep_argparser(**kwargs): + parser = argparse.ArgumentParser( + description=describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter, + **kwargs + ) + parser.add_argument( + "MMIF_FILE", + nargs="?", + type=argparse.FileType("r"), + default=None if sys.stdin.isatty() else sys.stdin, + help='input MMIF file path, or STDIN if `-` or not provided.' + ) + parser.add_argument( + "-o", "--output", + type=argparse.FileType("w"), + default=sys.stdout, + help='output file path, or STDOUT if not provided.' + ) + parser.add_argument( + "-p", "--pretty", + action="store_true", + help="Pretty-print JSON output" + ) + return parser + + +def main(args): + # Read MMIF content + mmif_content = args.MMIF_FILE.read() + + # For file input, we need to handle the path + # If input is from stdin, create a temp file + import tempfile + with tempfile.NamedTemporaryFile( + mode='w', suffix='.mmif', delete=False + ) as tmp: + tmp.write(mmif_content) + tmp_path = tmp.name + + try: + spec = get_pipeline_specs(tmp_path) + pipeline_id = generate_pipeline_identifier(tmp_path) + + # Convert to JSON-serializable format and calculate stats + views = {} + annotation_count_stats = {"total": 0} + annotation_count_by_type = {} + + for (view_id, app, configs, running_time, running_hardware, + annotation_count, annotations_by_type) in spec: + entry = { + "app": app, + "appConfiguration": configs, + } + # Output in new appProfiling format + if running_time is not None or running_hardware is not None: + profiling = {} + if running_time is not None: + profiling["runningTime"] = running_time + if running_hardware is not None: + profiling["hardware"] = running_hardware + entry["appProfiling"] = profiling + + views[view_id] = entry + + # Build annotation count stats + annotation_count_stats["total"] += annotation_count + annotation_count_stats[view_id] = annotation_count + + # Build annotation count by type stats + for at_type, count in annotations_by_type.items(): + if at_type not in annotation_count_by_type: + annotation_count_by_type[at_type] = {"total": 0} + annotation_count_by_type[at_type]["total"] += count + annotation_count_by_type[at_type][view_id] = count + + output = { + "pipeline_id": pipeline_id, + "stats": { + "viewCount": len(views), + "annotationCount": annotation_count_stats, + "annotationCountByType": annotation_count_by_type + }, + "views": views + } + + # Write output + if args.pretty: + json.dump(output, args.output, indent=2) + else: + json.dump(output, args.output) + args.output.write('\n') + finally: + # Clean up temp file + import os + os.unlink(tmp_path) + + +if __name__ == "__main__": + parser = prep_argparser() + args = parser.parse_args() + main(args) From 9348b6832aa4c03a7a2895ef95da295df404e421 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 20 Nov 2025 11:59:03 -0500 Subject: [PATCH 12/14] updated other CLI modules' argparser for consistency a --- mmif/utils/cli/rewind.py | 12 ++++++------ mmif/utils/cli/source.py | 13 ++++--------- tests/test_utils_cli.py | 8 ++++---- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/mmif/utils/cli/rewind.py b/mmif/utils/cli/rewind.py index 570b0893..e5bd4ed3 100644 --- a/mmif/utils/cli/rewind.py +++ b/mmif/utils/cli/rewind.py @@ -76,14 +76,14 @@ def describe_argparser(): def prep_argparser(**kwargs): parser = argparse.ArgumentParser(description=describe_argparser()[1], formatter_class=argparse.RawDescriptionHelpFormatter, **kwargs) - parser.add_argument("IN_MMIF_FILE", + parser.add_argument("MMIF_FILE", nargs="?", type=argparse.FileType("r"), default=None if sys.stdin.isatty() else sys.stdin, help='input MMIF file path, or STDIN if `-` or not provided.') - parser.add_argument("OUT_MMIF_FILE", - nargs="?", type=argparse.FileType("w"), + parser.add_argument("-o", "--output", + type=argparse.FileType("w"), default=sys.stdout, - help='output MMIF file path, or STDOUT if `-` or not provided.') + help='output file path, or STDOUT if not provided.') parser.add_argument("-p", '--pretty', action='store_true', help="Pretty-print rewound MMIF") parser.add_argument("-n", '--number', default="0", type=int, @@ -95,7 +95,7 @@ def prep_argparser(**kwargs): def main(args): - mmif_obj = mmif.Mmif(args.IN_MMIF_FILE.read()) + mmif_obj = mmif.Mmif(args.MMIF_FILE.read()) if args.number == 0: # If user doesn't know how many views to rewind, give them choices. choice = prompt_user(mmif_obj) @@ -104,7 +104,7 @@ def main(args): if not isinstance(choice, int) or choice <= 0: raise ValueError(f"Only can rewind by a positive number of views. Got {choice}.") - args.OUT_MMIF_FILE.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) + args.output.write(rewind_mmif(mmif_obj, choice, args.mode == 'view').serialize(pretty=args.pretty)) if __name__ == "__main__": diff --git a/mmif/utils/cli/source.py b/mmif/utils/cli/source.py index 7e662f9f..1e6b6b1b 100644 --- a/mmif/utils/cli/source.py +++ b/mmif/utils/cli/source.py @@ -258,10 +258,9 @@ def prep_argparser(**kwargs): ) parser.add_argument( '-o', '--output', - default=None, - action='store', - nargs='?', - help='A name of a file to capture a generated MMIF json. When not given, MMIF is printed to stdout.' + type=argparse.FileType('w'), + default=sys.stdout, + help='output file path, or STDOUT if not provided.' ) scheme_help = 'A scheme to associate with the document location URI. When not given, the default scheme is `file://`.' if len(discovered_docloc_plugins) > 0: @@ -279,12 +278,8 @@ def prep_argparser(**kwargs): def main(args): - if args.output: - out_f = open(args.output, 'w') - else: - out_f = sys.stdout mmif = generate_source_mmif_from_file(windows_path=False, **vars(args)) - out_f.write(mmif) + args.output.write(mmif) return mmif if __name__ == '__main__': diff --git a/tests/test_utils_cli.py b/tests/test_utils_cli.py index adb8e857..32f4c8bc 100644 --- a/tests/test_utils_cli.py +++ b/tests/test_utils_cli.py @@ -44,11 +44,11 @@ def get_params(self): return params def generate_source_mmif(self): - - # to suppress output (otherwise, set to stdout by default + + # to suppress output (otherwise, set to stdout by default) args = self.parser.parse_args(self.get_params()) - args.output = os.devnull - + args.output = open(os.devnull, 'w') + return source.main(args) def test_accept_file_paths(self): From 43e2e140482425ac33765c250eb5c3527ecee8c7 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 20 Nov 2025 11:34:52 -0500 Subject: [PATCH 13/14] updated handling of "null"-views in `describe` module --- mmif/utils/cli/describe.py | 68 ++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index 92e26864..a8fa6530 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -50,14 +50,23 @@ def generate_param_hash(params: dict) -> str: def get_pipeline_specs( mmif_file: Union[str, Path] -) -> List[Tuple[str, str, dict, Optional[int], Optional[dict], int, dict]]: +) -> Tuple[ + List[Tuple[str, str, dict, Optional[int], Optional[dict], int, dict]], + List[str], List[str], List[str] +]: """ Read a MMIF file and extract the pipeline specification from it. + Extracts app configurations, profiling data, and annotation statistics + for each contentful view. Views with errors, warnings, or no annotations + are tracked separately. + :param mmif_file: Path to the MMIF file - :return: List of tuples containing (view_id, app_name, configs, + :return: Tuple of (spec_list, error_views, warning_views, empty_views) + where spec_list contains tuples of (view_id, app_name, configs, running_time_ms, running_hardware, annotation_count, - annotations_by_type) for each view in the pipeline + annotations_by_type) for each contentful view, and the three + lists contain view IDs for error/warning/empty views respectively """ if not isinstance(mmif_file, (str, Path)): raise ValueError( @@ -69,12 +78,20 @@ def get_pipeline_specs( data = Mmif(mmif_str) spec = [] + error_views = [] + warning_views = [] + empty_views = [] for view in data.views: - # Skip views with errors, warnings, or no annotations - if view.has_error() or view.has_warnings(): + # Track error, warning, and empty views (mutually exclusive) + if view.has_error(): + error_views.append(view.id) + continue + elif view.has_warnings(): + warning_views.append(view.id) continue elif len(view.annotations) == 0: + empty_views.append(view.id) continue app = view.metadata.get("app") @@ -125,7 +142,7 @@ def get_pipeline_specs( annotation_count, annotations_by_type )) - return spec + return spec, error_views, warning_views, empty_views def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: @@ -136,7 +153,8 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: app_name/version/param_hash/app_name2/version2/param_hash2/... Uses view.metadata.parameters (raw user-passed values) for hashing - to ensure reproducibility. + to ensure reproducibility. Views with errors or warnings are excluded + from the identifier; empty views (no annotations) are included. :param mmif_file: Path to the MMIF file :return: Pipeline identifier string @@ -153,11 +171,9 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: segments = [] for view in data.views: - # Skip views with errors, warnings, or no annotations + # Skip views with errors or warnings if view.has_error() or view.has_warnings(): continue - elif len(view.annotations) == 0: - continue app = view.metadata.get("app") app_name, app_version = split_appname_appversion(app) @@ -188,9 +204,17 @@ def describe_argparser(): 'file.' ) additional = textwrap.dedent(""" - MMIF describe extracts pipeline information from a MMIF file, including - app names, runtime configurations, and runtime profiling statistics - for each view in the processing pipeline.""") + MMIF describe extracts pipeline information from a MMIF file and outputs + a JSON summary including: + + - pipeline_id: unique identifier for the pipeline based on apps, versions, + and parameter hashes (excludes error/warning views) + - stats: annotation counts (total and per-view), counts by annotation type, + and lists of error/warning/empty view IDs + - views: map of view IDs to app configurations and profiling data + + Views with errors or warnings are tracked but excluded from the pipeline + identifier and annotation statistics.""") return oneliner, oneliner + '\n\n' + additional @@ -222,6 +246,17 @@ def prep_argparser(**kwargs): def main(args): + """ + Main entry point for the describe CLI command. + + Reads a MMIF file and outputs a JSON summary containing: + - pipeline_id: unique identifier for the pipeline + - stats: view counts, annotation counts (total/per-view/per-type), + and lists of error/warning/empty view IDs + - views: map of view IDs to app configurations and profiling data + + :param args: Parsed command-line arguments + """ # Read MMIF content mmif_content = args.MMIF_FILE.read() @@ -235,7 +270,9 @@ def main(args): tmp_path = tmp.name try: - spec = get_pipeline_specs(tmp_path) + spec, error_views, warning_views, empty_views = get_pipeline_specs( + tmp_path + ) pipeline_id = generate_pipeline_identifier(tmp_path) # Convert to JSON-serializable format and calculate stats @@ -275,6 +312,9 @@ def main(args): "pipeline_id": pipeline_id, "stats": { "viewCount": len(views), + "errorViews": error_views, + "warningViews": warning_views, + "emptyViews": empty_views, "annotationCount": annotation_count_stats, "annotationCountByType": annotation_count_by_type }, From f7eedfa6337b441c39af5b5370242ed2baf02435 Mon Sep 17 00:00:00 2001 From: Keigh Rim Date: Thu, 20 Nov 2025 11:53:28 -0500 Subject: [PATCH 14/14] fixed type hints, changed time format when profiling is found --- mmif/utils/cli/describe.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/mmif/utils/cli/describe.py b/mmif/utils/cli/describe.py index a8fa6530..ab95205d 100644 --- a/mmif/utils/cli/describe.py +++ b/mmif/utils/cli/describe.py @@ -9,19 +9,22 @@ from mmif import Mmif -def split_appname_appversion(long_app_id: str) -> Tuple[str, str]: +def split_appname_appversion( + long_app_id: str +) -> Tuple[Optional[str], Optional[str]]: """ Split app name and version from a long app identifier. Assumes the identifier looks like "uri://APP_DOMAIN/APP_NAME/APP_VERSION" :param long_app_id: Full app identifier URI - :return: Tuple of (app_name, app_version) + :return: Tuple of (app_name, app_version), either may be None if not found """ app_path = Path(long_app_id).parts app_name = app_path[2] if len(app_path) > 2 else None app_version = app_path[3] if len(app_path) > 3 else None - if app_version is not None and app_name.endswith(app_version): + if (app_version is not None and app_name is not None + and app_name.endswith(app_version)): app_name = app_name[:-len(app_version) - 1] if app_version == 'unresolvable': app_version = None @@ -51,7 +54,7 @@ def generate_param_hash(params: dict) -> str: def get_pipeline_specs( mmif_file: Union[str, Path] ) -> Tuple[ - List[Tuple[str, str, dict, Optional[int], Optional[dict], int, dict]], + List[Tuple[str, Optional[str], dict, Optional[str], Optional[dict], int, dict]], List[str], List[str], List[str] ]: """ @@ -97,27 +100,15 @@ def get_pipeline_specs( app = view.metadata.get("app") configs = view.metadata.get("appConfiguration", {}) - # Parse running time from hh:mm:ss.ms format to milliseconds + # Get running time string (H:MM:SS.microseconds format) # Support both new (appProfiling.runningTime) and old (appRunningTime) running_time = None - time_str = None if "appProfiling" in view.metadata: profiling = view.metadata["appProfiling"] if isinstance(profiling, dict) and "runningTime" in profiling: - time_str = profiling["runningTime"] + running_time = profiling["runningTime"] elif "appRunningTime" in view.metadata: - time_str = view.metadata["appRunningTime"] - - if time_str: - try: - rest, ms = time_str.split(".") - running_time = sum( - int(x) * 60**i - for i, x in enumerate(reversed(rest.split(":"))) - ) * 1000 + int(ms) - except (ValueError, AttributeError): - # If parsing fails, leave as None - pass + running_time = view.metadata["appRunningTime"] # Support both new (appProfiling.hardware) and old (appRunningHardware) running_hardware = None @@ -176,6 +167,8 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: continue app = view.metadata.get("app") + if app is None: + continue app_name, app_version = split_appname_appversion(app) # Use raw parameters for reproducibility @@ -187,8 +180,9 @@ def generate_pipeline_identifier(mmif_file: Union[str, Path]) -> str: param_hash = generate_param_hash(param_dict) # Build segment: app_name/version/hash + name_str = app_name if app_name else "unknown" version_str = app_version if app_version else "unversioned" - segments.append(f"{app_name}/{version_str}/{param_hash}") + segments.append(f"{name_str}/{version_str}/{param_hash}") return '/'.join(segments)