
Commit 7861537

chore: remove audio object annotation
1 parent 0ca9cd6 commit 7861537

File tree: 6 files changed (+9, -197 lines)

libs/labelbox/src/labelbox/data/annotation_types/__init__.py
Lines changed: 0 additions & 1 deletion

@@ -20,7 +20,6 @@
 from .video import VideoMaskAnnotation

 from .audio import AudioClassificationAnnotation
-from .audio import AudioObjectAnnotation

 from .ner import ConversationEntity
 from .ner import DocumentEntity

libs/labelbox/src/labelbox/data/annotation_types/audio.py
Lines changed: 0 additions & 33 deletions

@@ -2,11 +2,6 @@

 from labelbox.data.annotation_types.annotation import (
     ClassificationAnnotation,
-    ObjectAnnotation,
-)
-from labelbox.data.mixins import (
-    ConfidenceNotSupportedMixin,
-    CustomMetricsNotSupportedMixin,
 )


@@ -33,31 +28,3 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
     segment_index: Optional[int] = None


-class AudioObjectAnnotation(
-    ObjectAnnotation,
-    ConfidenceNotSupportedMixin,
-    CustomMetricsNotSupportedMixin,
-):
-    """Audio object annotation for specific time range
-
-    Examples:
-        - Transcription: "Hello world" from 2500ms to 4100ms
-        - Sound events: "Dog barking" from 10000ms to 12000ms
-        - Audio segments with metadata
-
-    Args:
-        name (Optional[str]): Name of the annotation
-        feature_schema_id (Optional[Cuid]): Feature schema identifier
-        value (Union[TextEntity, Geometry]): Localization or text content
-        start_frame (int): The frame index in milliseconds (e.g., 10000 = 10.0 seconds)
-        end_frame (Optional[int]): End frame in milliseconds (for time ranges)
-        keyframe (bool): Whether this is a keyframe annotation (default: True)
-        segment_index (Optional[int]): Index of audio segment this annotation belongs to
-        classifications (Optional[List[ClassificationAnnotation]]): Optional sub-classifications
-        extra (Dict[str, Any]): Additional metadata
-    """
-
-    start_frame: int
-    end_frame: Optional[int] = None
-    keyframe: bool = True
-    segment_index: Optional[int] = None
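After this change, AudioClassificationAnnotation is the only audio annotation type left in this module. A minimal construction sketch, mirroring the remaining tests further down in this commit; the millisecond value, names, and segment index are illustrative, not taken from the diff:

from labelbox.data.annotation_types.audio import AudioClassificationAnnotation
from labelbox.data.annotation_types.classification.classification import (
    ClassificationAnswer,
    Radio,
)

# Radio classification anchored at 10.0 seconds; start_frame is in milliseconds.
# The values below are illustrative only.
speaker = AudioClassificationAnnotation(
    start_frame=10000,
    name="speaker",
    value=Radio(answer=ClassificationAnswer(name="john")),
    segment_index=0,
)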

libs/labelbox/src/labelbox/data/annotation_types/label.py
Lines changed: 5 additions & 6 deletions

@@ -13,7 +13,7 @@
 from .metrics import ScalarMetric, ConfusionMatrixMetric
 from .video import VideoClassificationAnnotation
 from .video import VideoObjectAnnotation, VideoMaskAnnotation
-from .audio import AudioClassificationAnnotation, AudioObjectAnnotation
+from .audio import AudioClassificationAnnotation
 from .mmc import MessageEvaluationTaskAnnotation
 from pydantic import BaseModel, field_validator

@@ -46,7 +46,6 @@ class Label(BaseModel):
             ObjectAnnotation,
             VideoMaskAnnotation,
             AudioClassificationAnnotation,
-            AudioObjectAnnotation,
             ScalarMetric,
             ConfusionMatrixMetric,
             RelationshipAnnotation,
@@ -91,7 +90,7 @@ def frame_annotations(
     def audio_annotations_by_frame(
         self,
     ) -> Dict[
-        int, List[Union[AudioObjectAnnotation, AudioClassificationAnnotation]]
+        int, List[AudioClassificationAnnotation]
     ]:
         """Get audio annotations organized by frame (millisecond)

@@ -100,15 +99,15 @@ def audio_annotations_by_frame(

         Example:
             >>> label.audio_annotations_by_frame()
-            {2500: [AudioClassificationAnnotation(...)], 10000: [AudioObjectAnnotation(...)]}
+            {2500: [AudioClassificationAnnotation(...)]}
         """
         frame_dict = defaultdict(list)
         for annotation in self.annotations:
             if isinstance(
                 annotation,
-                (AudioObjectAnnotation, AudioClassificationAnnotation),
+                AudioClassificationAnnotation,
             ):
-                frame_dict[annotation.frame].append(annotation)
+                frame_dict[annotation.start_frame].append(annotation)
         return dict(frame_dict)

     def add_url_to_masks(self, signer) -> "Label":
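With the object variant gone, audio_annotations_by_frame collects only AudioClassificationAnnotation instances and keys them by start_frame (milliseconds) instead of the old frame attribute. A rough usage sketch, reusing the shapes from the tests below; the global key and time values are illustrative:

import labelbox.types as lb_types
from labelbox.data.annotation_types.audio import AudioClassificationAnnotation
from labelbox.data.annotation_types.classification.classification import (
    ClassificationAnswer,
    Radio,
)

label = lb_types.Label(
    data={"global_key": "audio_file.mp3"},  # illustrative global key
    annotations=[
        AudioClassificationAnnotation(
            start_frame=2500,
            name="speaker",
            value=Radio(answer=ClassificationAnswer(name="john")),
        )
    ],
)

# Grouped by start_frame in milliseconds; only classification annotations remain.
by_frame = label.audio_annotations_by_frame()
assert list(by_frame.keys()) == [2500]
assert by_frame[2500][0].name == "speaker"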

libs/labelbox/src/labelbox/data/serialization/ndjson/label.py
Lines changed: 2 additions & 9 deletions

@@ -27,7 +27,6 @@
 from typing import List
 from ...annotation_types.audio import (
     AudioClassificationAnnotation,
-    AudioObjectAnnotation,
 )
 from labelbox.types import DocumentRectangle, DocumentEntity
 from .classification import (
@@ -87,7 +86,7 @@ def _get_consecutive_frames(
         return consecutive

     @classmethod
-    def _get_audio_frame_ranges(cls, annotation_group: List[Union[AudioClassificationAnnotation, AudioObjectAnnotation]]) -> List[Tuple[int, int]]:
+    def _get_audio_frame_ranges(cls, annotation_group: List[AudioClassificationAnnotation]) -> List[Tuple[int, int]]:
         """Get frame ranges for audio annotations (simpler than video segments)"""
         return [(ann.start_frame, getattr(ann, 'end_frame', None) or ann.start_frame) for ann in annotation_group]

@@ -214,7 +213,7 @@ def _create_audio_annotations(

         # Collect audio annotations
         for annot in label.annotations:
-            if isinstance(annot, (AudioClassificationAnnotation, AudioObjectAnnotation)):
+            if isinstance(annot, AudioClassificationAnnotation):
                 audio_annotations[annot.feature_schema_id or annot.name].append(annot)

         for annotation_group in audio_annotations.values():
@@ -232,11 +231,6 @@ def _create_audio_annotations(
                 annotation.extra.update({"frames": frames_data})
                 yield NDClassification.from_common(annotation, label.data)

-            # Process objects
-            elif isinstance(annotation_group[0], AudioObjectAnnotation):
-                # For audio objects, process individually (simpler than video segments)
-                for annotation in annotation_group:
-                    yield NDObject.from_common(annotation, label.data)


     @classmethod
@@ -251,7 +245,6 @@ def _create_non_video_annotations(cls, label: Label):
                     VideoObjectAnnotation,
                     VideoMaskAnnotation,
                     AudioClassificationAnnotation,
-                    AudioObjectAnnotation,
                     RelationshipAnnotation,
                 ),
             )
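_get_audio_frame_ranges now only sees classification annotations; each one collapses to a (start, end) pair, with end falling back to start_frame when no end_frame is set. A small worked sketch of that expression, with made-up annotation values for illustration:

from labelbox.data.annotation_types.audio import AudioClassificationAnnotation
from labelbox.data.annotation_types.classification.classification import (
    ClassificationAnswer,
    Radio,
)

# Illustrative group with a single annotation and no explicit end_frame.
annotation_group = [
    AudioClassificationAnnotation(
        start_frame=2500,
        name="speaker",
        value=Radio(answer=ClassificationAnswer(name="john")),
    ),
]

# Same expression as _get_audio_frame_ranges: (start, end or start) per annotation.
ranges = [
    (ann.start_frame, getattr(ann, "end_frame", None) or ann.start_frame)
    for ann in annotation_group
]
assert ranges == [(2500, 2500)]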

libs/labelbox/src/labelbox/data/serialization/ndjson/objects.py
Lines changed: 0 additions & 44 deletions

@@ -14,9 +14,6 @@
 from labelbox.data.annotation_types.video import (
     VideoObjectAnnotation,
 )
-from labelbox.data.annotation_types.audio import (
-    AudioObjectAnnotation,
-)
 from labelbox.data.mixins import (
     ConfidenceMixin,
     CustomMetric,
@@ -718,7 +715,6 @@ def from_common(
             ObjectAnnotation,
             List[List[VideoObjectAnnotation]],
             VideoMaskAnnotation,
-            AudioObjectAnnotation,
         ],
         data: GenericDataRowData,
     ) -> Union[
@@ -746,9 +742,6 @@ def from_common(
             return obj.from_common(**args)
         elif obj == NDVideoMasks:
             return obj.from_common(annotation, data)
-        elif isinstance(annotation, AudioObjectAnnotation):
-            # Handle audio object annotation like single video frame
-            return cls._serialize_audio_object_annotation(annotation, data)

         subclasses = [
             NDSubclassification.from_common(annot)
@@ -772,43 +765,6 @@ def from_common(
             **optional_kwargs,
         )

-    @classmethod
-    def _serialize_audio_object_annotation(
-        cls, annotation: AudioObjectAnnotation, data: GenericDataRowData
-    ):
-        """Serialize audio object annotation with temporal information
-
-        Args:
-            annotation: Audio object annotation to process
-            data: Data row data
-
-        Returns:
-            NDObject: Serialized audio object annotation
-        """
-        # Get the appropriate NDObject subclass based on the annotation value type
-        obj = cls.lookup_object(annotation)
-
-        # Process sub-classifications if any
-        subclasses = [
-            NDSubclassification.from_common(annot)
-            for annot in annotation.classifications
-        ]
-
-        # Add frame information to extra (milliseconds)
-        extra = annotation.extra.copy() if annotation.extra else {}
-        extra.update({"frame": annotation.frame})
-
-        # Create the NDObject with frame information
-        return obj.from_common(
-            str(annotation._uuid),
-            annotation.value,
-            subclasses,
-            annotation.name,
-            annotation.feature_schema_id,
-            extra,
-            data,
-        )
-
     @staticmethod
     def lookup_object(
         annotation: Union[ObjectAnnotation, List],

libs/labelbox/tests/data/annotation_types/test_audio.py
Lines changed: 2 additions & 104 deletions

@@ -2,7 +2,6 @@
 import labelbox.types as lb_types
 from labelbox.data.annotation_types.audio import (
     AudioClassificationAnnotation,
-    AudioObjectAnnotation,
 )
 from labelbox.data.annotation_types.classification.classification import (
     ClassificationAnswer,
@@ -71,64 +70,6 @@ def test_audio_classification_text_type():
     assert annotation.value.answer == "excellent"


-def test_audio_object_creation():
-    """Test creating audio object annotation"""
-    annotation = AudioObjectAnnotation(
-        start_frame=10000,
-        end_frame=12500,
-        name="transcription",
-        value=lb_types.TextEntity(
-            start=0, end=11
-        ),  # "Hello world" has 11 characters
-    )
-
-    assert annotation.start_frame == 10000
-    assert annotation.end_frame == 12500
-    assert annotation.keyframe is True
-    assert annotation.segment_index is None
-    assert annotation.name == "transcription"
-    assert isinstance(annotation.value, lb_types.TextEntity)
-    assert annotation.value.start == 0
-    assert annotation.value.end == 11
-
-
-def test_audio_object_creation_with_classifications():
-    """Test creating audio object with sub-classifications"""
-    sub_classification = AudioClassificationAnnotation(
-        start_frame=10000,
-        name="confidence",
-        value=Radio(answer=ClassificationAnswer(name="high")),
-    )
-
-    annotation = AudioObjectAnnotation(
-        start_frame=10000,
-        end_frame=12500,
-        name="transcription",
-        value=lb_types.TextEntity(start=0, end=11),
-        classifications=[sub_classification],
-    )
-
-    assert len(annotation.classifications) == 1
-    assert annotation.classifications[0].name == "confidence"
-    assert annotation.classifications[0].start_frame == 10000
-
-
-def test_audio_object_direct_creation():
-    """Test creating audio object directly with various options"""
-    annotation = AudioObjectAnnotation(
-        start_frame=7500,  # 7.5 seconds
-        name="sound_event",
-        value=lb_types.TextEntity(start=0, end=11),
-        keyframe=False,
-        segment_index=2,
-    )
-
-    assert annotation.start_frame == 7500
-    assert annotation.end_frame is None
-    assert annotation.keyframe is False
-    assert annotation.segment_index == 2
-
-
 def test_frame_precision():
     """Test frame values maintain precision"""
     # Test various time values in milliseconds
@@ -155,38 +96,24 @@ def test_audio_label_integration():
         value=Radio(answer=ClassificationAnswer(name="john")),
     )

-    transcription_annotation = AudioObjectAnnotation(
-        start_frame=1000,
-        end_frame=2000,
-        name="transcription",
-        value=lb_types.TextEntity(start=0, end=5),
-    )
-
     # Create label with audio annotations
     label = lb_types.Label(
         data={"global_key": "audio_file.mp3"},
-        annotations=[speaker_annotation, transcription_annotation],
+        annotations=[speaker_annotation],
     )

     # Verify annotations are accessible
-    assert len(label.annotations) == 2
+    assert len(label.annotations) == 1

     # Check annotation types
     audio_classifications = [
         ann
         for ann in label.annotations
         if isinstance(ann, AudioClassificationAnnotation)
     ]
-    audio_objects = [
-        ann
-        for ann in label.annotations
-        if isinstance(ann, AudioObjectAnnotation)
-    ]

     assert len(audio_classifications) == 1
-    assert len(audio_objects) == 1
     assert audio_classifications[0].name == "speaker"
-    assert audio_objects[0].name == "transcription"


 def test_audio_annotation_validation():
@@ -384,32 +311,3 @@ def test_temporal_annotation_grouping():
     assert annotations[1].start_frame == 1000
     assert annotations[0].end_frame == 900
     assert annotations[1].end_frame == 1900
-
-
-def test_audio_object_types():
-    """Test different types of audio object annotations"""
-    # Text entity (transcription)
-    text_obj = AudioObjectAnnotation(
-        start_frame=1000,
-        name="transcription",
-        value=TextEntity(start=0, end=5),  # "hello"
-    )
-
-    assert isinstance(text_obj.value, TextEntity)
-    assert text_obj.value.start == 0
-    assert text_obj.value.end == 5
-
-    # Test with keyframe and segment settings
-    keyframe_obj = AudioObjectAnnotation(
-        start_frame=2000,
-        end_frame=3000,
-        name="segment",
-        value=TextEntity(start=10, end=15),
-        keyframe=True,
-        segment_index=1,
-    )
-
-    assert keyframe_obj.keyframe is True
-    assert keyframe_obj.segment_index == 1
-    assert keyframe_obj.start_frame == 2000
-    assert keyframe_obj.end_frame == 3000
