PTDT-3807: Add temporal audio annotation support #2013
base: develop
Changes from all commits
e4fd630
dbcc7bf
dbb592f
ff298d4
16896fd
7a666cc
ac58ad0
67dd14a
a1600e5
b4d2f42
fadb14e
1e12596
c2a7b4c
26a35fd
b16f2ea
943cb73
a838513
0ca9cd6
7861537
6c3c50a
68773cf
58b30f7
0a63def
538ba66
9675c73
327800b
1174ad8
2361ca3
59f0cd8
b186359
e63b306
a74c6c4
0683dfd
File: examples/annotation_import/audio.ipynb
@@ -27,6 +27,30 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"<td>\n",
" <a target=\"_blank\" href=\"https://labelbox.com\" ><img src=\"https://labelbox.com/blog/content/images/2021/02/logo-v4.svg\" width=256/></a>\n",
"</td>\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"<td>\n",
"<a href=\"https://colab.research.google.com/github/Labelbox/labelbox-python/blob/develop/examples/annotation_import/audio.ipynb\" target=\"_blank\"><img\n",
"src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"></a>\n",
"</td>\n",
"\n",
"<td>\n",
"<a href=\"https://github.com/Labelbox/labelbox-python/tree/develop/examples/annotation_import/audio.ipynb\" target=\"_blank\"><img\n",
"src=\"https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white\" alt=\"GitHub\"></a>\n",
"</td>"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
@@ -170,7 +194,7 @@
},
{
"metadata": {},
- "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
+ "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n # Temporal classification for token-level annotations\n lb.Classification(\n class_type=lb.Classification.Type.TEXT,\n name=\"User Speaker\",\n scope=lb.Classification.Scope.INDEX, # INDEX scope for temporal\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
"cell_type": "code",
"outputs": [],
"execution_count": null
@@ -223,6 +247,27 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "label = []\nlabel.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation],\n ))",
@@ -252,6 +297,29 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": [
"## Temporal Audio Annotations\n",
"\n",
"You can create temporal annotations for individual tokens (words) with precise timing:\n"
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "# Create label with both regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": [
@@ -260,6 +328,13 @@
],
"cell_type": "markdown"
},
{
"metadata": {},
"source": "# Upload temporal annotations via MAL\ntemporal_upload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"temporal_mal_job-{str(uuid.uuid4())}\",\n predictions=label_with_temporal,\n)\n\ntemporal_upload_job.wait_until_done()\nprint(\"Temporal upload completed!\")\nprint(\"Errors:\", temporal_upload_job.errors)\nprint(\"Status:\", temporal_upload_job.statuses)",
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"source": "# Upload our label using Model-Assisted Labeling\nupload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"mal_job-{str(uuid.uuid4())}\",\n predictions=label,\n)\n\nupload_job.wait_until_done()\nprint(\"Errors:\", upload_job.errors)\nprint(\"Status of uploads: \", upload_job.statuses)",
File: labelbox/data/annotation_types/audio.py (new)
@@ -0,0 +1,37 @@
from typing import Optional
from pydantic import Field, AliasChoices

from labelbox.data.annotation_types.annotation import (
    ClassificationAnnotation,
)


class AudioClassificationAnnotation(ClassificationAnnotation):
    """Audio classification for specific time range

    Examples:
        - Speaker identification from 2500ms to 4100ms
        - Audio quality assessment for a segment
        - Language detection for audio segments

    Args:
        name (Optional[str]): Name of the classification
        feature_schema_id (Optional[Cuid]): Feature schema identifier
        value (Union[Text, Checklist, Radio]): Classification value
        start_frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds)
        end_frame (Optional[int]): End frame in milliseconds (for time ranges)
        segment_index (Optional[int]): Index of audio segment this annotation belongs to
        extra (Dict[str, Any]): Additional metadata
    """

    start_frame: int = Field(
        validation_alias=AliasChoices("start_frame", "frame"),
        serialization_alias="start_frame",
    )
    end_frame: Optional[int] = Field(
        default=None,
        validation_alias=AliasChoices("end_frame", "endFrame"),
        serialization_alias="end_frame",
    )
    segment_index: Optional[int] = None

Review comment on the start_frame field: "Bug: Audio Annotation API Inconsistency. The ..." (comment body truncated in this view).
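For orientation, here is a minimal usage sketch of the new class (not part of the PR). It assumes a labelbox build containing this change and that the class is exported through labelbox.types like the video annotation types; the values are hypothetical. It also illustrates the frame/start_frame alias behavior the reviewer flags above:

import labelbox.types as lb_types

# Canonical field name, as defined on the class:
ann = lb_types.AudioClassificationAnnotation(
    start_frame=2500,  # 2.5 seconds, per the docstring's millisecond convention
    end_frame=4100,    # optional end of the time range
    name="User Speaker",
    value=lb_types.Text(answer="Hello"),
)

# AliasChoices("start_frame", "frame") also accepts the notebook's spelling:
ann_alias = lb_types.AudioClassificationAnnotation(
    frame=2500,
    end_frame=4100,
    name="User Speaker",
    value=lb_types.Text(answer="Hello"),
)

assert ann.start_frame == ann_alias.start_frame == 2500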
File: labelbox/data/annotation_types/label.py
@@ -13,6 +13,7 @@
from .metrics import ScalarMetric, ConfusionMatrixMetric
from .video import VideoClassificationAnnotation
from .video import VideoObjectAnnotation, VideoMaskAnnotation
from .audio import AudioClassificationAnnotation
from .mmc import MessageEvaluationTaskAnnotation
from pydantic import BaseModel, field_validator

@@ -44,6 +45,7 @@ class Label(BaseModel):
            ClassificationAnnotation,
            ObjectAnnotation,
            VideoMaskAnnotation,
            AudioClassificationAnnotation,
            ScalarMetric,
            ConfusionMatrixMetric,
            RelationshipAnnotation,
@@ -75,15 +77,23 @@ def _get_annotations_by_type(self, annotation_type):

    def frame_annotations(
        self,
-   ) -> Dict[str, Union[VideoObjectAnnotation, VideoClassificationAnnotation]]:
+   ) -> Dict[int, Union[VideoObjectAnnotation, VideoClassificationAnnotation, AudioClassificationAnnotation]]:
+       """Get temporal annotations organized by frame
+
+       Returns:
+           Dict[int, List]: Dictionary mapping frame (milliseconds) to list of temporal annotations
+
+       Example:
+           >>> label.frame_annotations()
+           {2500: [VideoClassificationAnnotation(...), AudioClassificationAnnotation(...)]}
+       """
        frame_dict = defaultdict(list)
        for annotation in self.annotations:
-           if isinstance(
-               annotation,
-               (VideoObjectAnnotation, VideoClassificationAnnotation),
-           ):
+           if isinstance(annotation, (VideoObjectAnnotation, VideoClassificationAnnotation)):
                frame_dict[annotation.frame].append(annotation)
-       return frame_dict
+           elif isinstance(annotation, AudioClassificationAnnotation):
+               frame_dict[annotation.start_frame].append(annotation)
+       return dict(frame_dict)

Review comment on the audio branch: "Bug: Audio Annotations Indexed Inconsistently. The ..." (comment body truncated in this view).
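A minimal sketch of the updated method's behavior (not part of the PR; assumes a labelbox build containing this change, with a hypothetical global key). As the diff shows, audio annotations are grouped under their start_frame key, alongside any video annotations keyed by frame:

import labelbox.types as lb_types

label = lb_types.Label(
    data={"global_key": "sample-audio"},  # hypothetical global key
    annotations=[
        lb_types.AudioClassificationAnnotation(
            start_frame=586,
            end_frame=770,
            name="User Speaker",
            value=lb_types.Text(answer="Hello"),
        ),
    ],
)

# Audio annotations are keyed by start_frame (milliseconds):
by_frame = label.frame_annotations()
assert 586 in by_frame and len(by_frame[586]) == 1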
    def add_url_to_masks(self, signer) -> "Label":
        """
Review comment: "Bug: Duplicate Header Cells in Audio Notebook. The audio.ipynb notebook now includes duplicate header cells at the start. The commit adds new markdown cells (lines 30-53 in the diff) that are identical to the existing Labelbox logo and badge links, resulting in redundant content."