diff --git a/examples/README.md b/examples/README.md index 924d1017d..842286b2d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -16,15 +16,25 @@ + + Projects + Open In Github + Open In Colab + Ontologies Open In Github Open In Colab - Quick Start - Open In Github - Open In Colab + Batches + Open In Github + Open In Colab + + + Custom Embeddings + Open In Github + Open In Colab Data Rows @@ -37,25 +47,15 @@ Open In Colab - Batches - Open In Github - Open In Colab - - - Projects - Open In Github - Open In Colab + Quick Start + Open In Github + Open In Colab Data Row Metadata Open In Github Open In Colab - - Custom Embeddings - Open In Github - Open In Colab - User Management Open In Github @@ -75,25 +75,25 @@ + + Export Data + Open In Github + Open In Colab + Export V1 to V2 Migration Support Open In Github Open In Colab - - Exporting to CSV - Open In Github - Open In Colab - Composite Mask Export Open In Github Open In Colab - Export Data - Open In Github - Open In Colab + Exporting to CSV + Open In Github + Open In Colab @@ -143,36 +143,11 @@ - - Tiled - Open In Github - Open In Colab - Text Open In Github Open In Colab - - PDF - Open In Github - Open In Colab - - - Video - Open In Github - Open In Colab - - - Audio - Open In Github - Open In Colab - - - Conversational - Open In Github - Open In Colab - HTML Open In Github @@ -188,11 +163,36 @@ Open In Github Open In Colab + + Video + Open In Github + Open In Colab + + + Audio + Open In Github + Open In Colab + Conversational LLM Open In Github Open In Colab + + Tiled + Open In Github + Open In Colab + + + PDF + Open In Github + Open In Colab + + + Conversational + Open In Github + Open In Colab + @@ -208,9 +208,9 @@ - Langchain - Open In Github - Open In Colab + Meta SAM + Open In Github + Open In Colab Meta SAM Video @@ -218,20 +218,20 @@ Open In Colab - Meta SAM - Open In Github - Open In Colab + Huggingface Custom Embeddings + Open In Github + Open In Colab + + + Langchain + Open In Github + Open In Colab Import YOLOv8 Annotations Open In Github Open In Colab - - Huggingface Custom Embeddings - Open In Github - Open In Colab - @@ -247,25 +247,25 @@ - Model Predictions to Project - Open In Github - Open In Colab + Custom Metrics Basics + Open In Github + Open In Colab Custom Metrics Demo Open In Github Open In Colab - - Custom Metrics Basics - Open In Github - Open In Colab - Model Slices Open In Github Open In Colab + + Model Predictions to Project + Open In Github + Open In Colab + @@ -280,25 +280,15 @@ - - HTML Predictions - Open In Github - Open In Colab - Text Predictions Open In Github Open In Colab - Video Predictions - Open In Github - Open In Colab - - - Conversational Predictions - Open In Github - Open In Colab + PDF Predictions + Open In Github + Open In Colab Geospatial Predictions @@ -306,9 +296,14 @@ Open In Colab - PDF Predictions - Open In Github - Open In Colab + Conversational Predictions + Open In Github + Open In Colab + + + Video Predictions + Open In Github + Open In Colab Image Predictions @@ -320,6 +315,11 @@ Open In Github Open In Colab + + HTML Predictions + Open In Github + Open In Colab + diff --git a/examples/annotation_import/audio.ipynb b/examples/annotation_import/audio.ipynb index 437130a9e..615ac7c86 100644 --- a/examples/annotation_import/audio.ipynb +++ b/examples/annotation_import/audio.ipynb @@ -170,7 +170,7 @@ }, { "metadata": {}, - "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n 
lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)", + "source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n # Temporal classification for token-level annotations\n lb.Classification(\n class_type=lb.Classification.Type.TEXT,\n name=\"User Speaker\",\n scope=lb.Classification.Scope.INDEX, # INDEX scope for temporal\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)", "cell_type": "code", "outputs": [], "execution_count": null @@ -252,6 +252,29 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": [ + "## Temporal Audio Annotations\n", + "\n", + "You can create temporal annotations for individual tokens (words) with precise timing:\n" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": "# Create label with both regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, { "metadata": {}, "source": [ @@ -260,6 +283,13 @@ ], "cell_type": "markdown" }, + { + "metadata": {}, + "source": "# Upload temporal annotations via MAL\ntemporal_upload_job = lb.MALPredictionImport.create_from_objects(\n 
client=client,\n project_id=project.uid,\n name=f\"temporal_mal_job-{str(uuid.uuid4())}\",\n predictions=label_with_temporal,\n)\n\ntemporal_upload_job.wait_until_done()\nprint(\"Temporal upload completed!\")\nprint(\"Errors:\", temporal_upload_job.errors)\nprint(\"Status:\", temporal_upload_job.statuses)", + "cell_type": "code", + "outputs": [], + "execution_count": null + }, { "metadata": {}, "source": "# Upload our label using Model-Assisted Labeling\nupload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"mal_job-{str(uuid.uuid4())}\",\n predictions=label,\n)\n\nupload_job.wait_until_done()\nprint(\"Errors:\", upload_job.errors)\nprint(\"Status of uploads: \", upload_job.statuses)", diff --git a/libs/labelbox/src/labelbox/data/annotation_types/__init__.py b/libs/labelbox/src/labelbox/data/annotation_types/__init__.py index fc75652cf..9f59b5197 100644 --- a/libs/labelbox/src/labelbox/data/annotation_types/__init__.py +++ b/libs/labelbox/src/labelbox/data/annotation_types/__init__.py @@ -19,6 +19,8 @@ from .video import MaskInstance from .video import VideoMaskAnnotation +from .audio import AudioClassificationAnnotation + from .ner import ConversationEntity from .ner import DocumentEntity from .ner import DocumentTextSelection diff --git a/libs/labelbox/src/labelbox/data/annotation_types/audio.py b/libs/labelbox/src/labelbox/data/annotation_types/audio.py new file mode 100644 index 000000000..c86fba668 --- /dev/null +++ b/libs/labelbox/src/labelbox/data/annotation_types/audio.py @@ -0,0 +1,37 @@ +from typing import Optional +from pydantic import Field, AliasChoices + +from labelbox.data.annotation_types.annotation import ( + ClassificationAnnotation, +) + + +class AudioClassificationAnnotation(ClassificationAnnotation): + """Audio classification for specific time range + + Examples: + - Speaker identification from 2500ms to 4100ms + - Audio quality assessment for a segment + - Language detection for audio segments + + Args: + name (Optional[str]): Name of the classification + feature_schema_id (Optional[Cuid]): Feature schema identifier + value (Union[Text, Checklist, Radio]): Classification value + start_frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds) + end_frame (Optional[int]): End frame in milliseconds (for time ranges) + segment_index (Optional[int]): Index of audio segment this annotation belongs to + extra (Dict[str, Any]): Additional metadata + """ + + start_frame: int = Field( + validation_alias=AliasChoices("start_frame", "frame"), + serialization_alias="start_frame", + ) + end_frame: Optional[int] = Field( + default=None, + validation_alias=AliasChoices("end_frame", "endFrame"), + serialization_alias="end_frame", + ) + segment_index: Optional[int] = None + diff --git a/libs/labelbox/src/labelbox/data/annotation_types/classification/classification.py b/libs/labelbox/src/labelbox/data/annotation_types/classification/classification.py index d6a6448dd..aca1827a9 100644 --- a/libs/labelbox/src/labelbox/data/annotation_types/classification/classification.py +++ b/libs/labelbox/src/labelbox/data/annotation_types/classification/classification.py @@ -17,11 +17,17 @@ class ClassificationAnswer(FeatureSchema, ConfidenceMixin, CustomMetricsMixin): Each answer can have a keyframe independent of the others. So unlike object annotations, classification annotations track keyframes at a classification answer level. 
+ + - For temporal classifications (audio/video), optional start_frame/end_frame can specify + the time range for this answer. Must be within root annotation's frame range. + Defaults to root frame range if not specified. """ extra: Dict[str, Any] = {} keyframe: Optional[bool] = None classifications: Optional[List["ClassificationAnnotation"]] = None + start_frame: Optional[int] = None + end_frame: Optional[int] = None class Radio(ConfidenceMixin, CustomMetricsMixin, BaseModel): @@ -69,8 +75,12 @@ class ClassificationAnnotation( classifications (Optional[List[ClassificationAnnotation]]): Optional sub classification of the annotation feature_schema_id (Optional[Cuid]) value (Union[Text, Checklist, Radio]) + start_frame (Optional[int]): Start frame for temporal classifications (audio/video). Must be within root annotation's frame range. Defaults to root start_frame if not specified. + end_frame (Optional[int]): End frame for temporal classifications (audio/video). Must be within root annotation's frame range. Defaults to root end_frame if not specified. extra (Dict[str, Any]) """ value: Union[Text, Checklist, Radio] message_id: Optional[str] = None + start_frame: Optional[int] = None + end_frame: Optional[int] = None diff --git a/libs/labelbox/src/labelbox/data/annotation_types/label.py b/libs/labelbox/src/labelbox/data/annotation_types/label.py index d13fb8f20..228512a5d 100644 --- a/libs/labelbox/src/labelbox/data/annotation_types/label.py +++ b/libs/labelbox/src/labelbox/data/annotation_types/label.py @@ -13,6 +13,7 @@ from .metrics import ScalarMetric, ConfusionMatrixMetric from .video import VideoClassificationAnnotation from .video import VideoObjectAnnotation, VideoMaskAnnotation +from .audio import AudioClassificationAnnotation from .mmc import MessageEvaluationTaskAnnotation from pydantic import BaseModel, field_validator @@ -44,6 +45,7 @@ class Label(BaseModel): ClassificationAnnotation, ObjectAnnotation, VideoMaskAnnotation, + AudioClassificationAnnotation, ScalarMetric, ConfusionMatrixMetric, RelationshipAnnotation, @@ -75,15 +77,23 @@ def _get_annotations_by_type(self, annotation_type): def frame_annotations( self, - ) -> Dict[str, Union[VideoObjectAnnotation, VideoClassificationAnnotation]]: + ) -> Dict[int, Union[VideoObjectAnnotation, VideoClassificationAnnotation, AudioClassificationAnnotation]]: + """Get temporal annotations organized by frame + + Returns: + Dict[int, List]: Dictionary mapping frame (milliseconds) to list of temporal annotations + + Example: + >>> label.frame_annotations() + {2500: [VideoClassificationAnnotation(...), AudioClassificationAnnotation(...)]} + """ frame_dict = defaultdict(list) for annotation in self.annotations: - if isinstance( - annotation, - (VideoObjectAnnotation, VideoClassificationAnnotation), - ): + if isinstance(annotation, (VideoObjectAnnotation, VideoClassificationAnnotation)): frame_dict[annotation.frame].append(annotation) - return frame_dict + elif isinstance(annotation, AudioClassificationAnnotation): + frame_dict[annotation.start_frame].append(annotation) + return dict(frame_dict) def add_url_to_masks(self, signer) -> "Label": """ diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py index 2f4799d13..5fc19c004 100644 --- a/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/label.py @@ -2,7 +2,7 @@ import copy from itertools import groupby from operator import itemgetter -from 
typing import Generator, List, Tuple, Union +from typing import Any, Dict, Generator, List, Tuple, Union from uuid import uuid4 from pydantic import BaseModel @@ -24,6 +24,11 @@ VideoMaskAnnotation, VideoObjectAnnotation, ) +from typing import List +from ...annotation_types.audio import ( + AudioClassificationAnnotation, +) +from .temporal import create_audio_ndjson_annotations from labelbox.types import DocumentRectangle, DocumentEntity from .classification import ( NDChecklistSubclass, @@ -69,6 +74,7 @@ def from_common( yield from cls._create_relationship_annotations(label) yield from cls._create_non_video_annotations(label) yield from cls._create_video_annotations(label) + yield from cls._create_audio_annotations(label) @staticmethod def _get_consecutive_frames( @@ -80,6 +86,7 @@ def _get_consecutive_frames( consecutive.append((group[0], group[-1])) return consecutive + @classmethod def _get_segment_frame_ranges( cls, @@ -159,6 +166,32 @@ def _create_video_annotations( segments.append(segment) yield NDObject.from_common(segments, label.data) + @classmethod + def _create_audio_annotations( + cls, label: Label + ) -> Generator[BaseModel, None, None]: + """Create audio annotations with nested classifications using modular hierarchy builder.""" + # Extract audio annotations from the label + audio_annotations = [ + annot for annot in label.annotations + if isinstance(annot, AudioClassificationAnnotation) + ] + + if not audio_annotations: + return + + # Use the modular hierarchy builder to create NDJSON annotations + ndjson_annotations = create_audio_ndjson_annotations( + audio_annotations, + label.data.global_key + ) + + # Yield each NDJSON annotation + for annotation in ndjson_annotations: + yield annotation + + + @classmethod def _create_non_video_annotations(cls, label: Label): non_video_annotations = [ @@ -170,6 +203,7 @@ def _create_non_video_annotations(cls, label: Label): VideoClassificationAnnotation, VideoObjectAnnotation, VideoMaskAnnotation, + AudioClassificationAnnotation, RelationshipAnnotation, ), ) @@ -187,7 +221,7 @@ def _create_non_video_annotations(cls, label: Label): yield NDMessageTask.from_common(annotation, label.data) else: raise TypeError( - f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value',annotation))}`" + f"Unable to convert object to MAL format. `{type(getattr(annotation, 'value', annotation))}`" ) @classmethod diff --git a/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py b/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py new file mode 100644 index 000000000..860432230 --- /dev/null +++ b/libs/labelbox/src/labelbox/data/serialization/ndjson/temporal.py @@ -0,0 +1,422 @@ +""" +Generic hierarchical classification builder for NDJSON serialization. + +This module provides reusable components for constructing nested hierarchical +classifications from temporal annotations (audio, video, etc.). + +IMPORTANT: This module ONLY supports explicit nesting via ClassificationAnswer.classifications. +Annotations must define their hierarchy structure explicitly in the annotation objects. +Temporal containment-based inference is NOT supported. 
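+
+Example of explicit nesting (a minimal sketch; the names are illustrative and mirror
+the audio serialization tests, they are not required by the API):
+
+    AudioClassificationAnnotation(
+        frame=200, end_frame=1500, name="radio_class",
+        value=Radio(answer=ClassificationAnswer(
+            name="first_radio_answer",
+            classifications=[
+                ClassificationAnnotation(
+                    name="sub_radio_question",
+                    value=Radio(answer=ClassificationAnswer(
+                        name="first_sub_radio_answer",
+                        start_frame=1000, end_frame=1500,  # optional; defaults to the root range
+                    )),
+                )
+            ],
+        )),
+    )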
+""" + +from collections import defaultdict +from typing import Any, Dict, List, Tuple, TypeVar, Generic +from pydantic import BaseModel + +from ...annotation_types.audio import AudioClassificationAnnotation + +# Generic type for temporal annotations +TemporalAnnotation = TypeVar('TemporalAnnotation', bound=Any) + + +class TemporalFrame: + """Represents a time frame in temporal annotations (audio, video, etc.).""" + + def __init__(self, start: int, end: int = None): + self.start = start + self.end = end or start + + def contains(self, other: "TemporalFrame") -> bool: + """Check if this frame contains another frame.""" + return (self.start <= other.start and + self.end is not None and other.end is not None and + self.end >= other.end) + + def strictly_contains(self, other: "TemporalFrame") -> bool: + """Check if this frame strictly contains another frame (not equal).""" + return (self.contains(other) and + (self.start < other.start or self.end > other.end)) + + def overlaps(self, other: "TemporalFrame") -> bool: + """Check if this frame overlaps with another frame.""" + return not (self.end < other.start or other.end < self.start) + + def to_dict(self) -> Dict[str, int]: + """Convert to dictionary format.""" + return {"start": self.start, "end": self.end} + + +class AnnotationGroupManager(Generic[TemporalAnnotation]): + """Manages grouping of temporal annotations by classification type. + + NOTE: Since we only support explicit nesting via ClassificationAnswer.classifications, + all top-level AudioClassificationAnnotation objects are considered roots. + """ + + def __init__(self, annotations: List[TemporalAnnotation], frame_extractor: callable): + self.annotations = annotations + self.frame_extractor = frame_extractor # Function to extract (start, end) from annotation + self.groups = self._group_annotations() + self.root_groups = set(self.groups.keys()) # All groups are roots with explicit nesting + + def _group_annotations(self) -> Dict[str, List[TemporalAnnotation]]: + """Group annotations by classification key (schema_id or name).""" + groups = defaultdict(list) + for annot in self.annotations: + key = annot.feature_schema_id or annot.name + groups[key].append(annot) + return dict(groups) + + def get_group_display_name(self, group_key: str) -> str: + """Get display name for a group.""" + group_anns = self.groups[group_key] + # Prefer the first non-empty annotation name + for ann in group_anns: + if ann.name: + return ann.name + return group_key + + +class ValueGrouper(Generic[TemporalAnnotation]): + """Handles grouping of annotations by their values and answer construction.""" + + def __init__(self, frame_extractor: callable): + self.frame_extractor = frame_extractor # Function to extract (start, end) from annotation + + def group_by_value(self, annotations: List[TemporalAnnotation]) -> List[Dict[str, Any]]: + """Group annotations by logical value and produce answer entries.""" + value_buckets = defaultdict(list) + + for ann in annotations: + key = self._get_value_key(ann) + value_buckets[key].append(ann) + + entries = [] + for _, anns in value_buckets.items(): + # Extract frames from each annotation (root frames) + frames = [self.frame_extractor(a) for a in anns] + frame_dicts = [{"start": start, "end": end} for start, end in frames] + + # Get root frames for passing to nested classifications (use first annotation's frames) + root_frames = frames[0] if frames else (None, None) + + # Pass ALL annotations so we can merge their nested classifications + entry = 
self._create_answer_entry(anns, frame_dicts, root_frames) + entries.append(entry) + + return entries + + def _get_value_key(self, ann: TemporalAnnotation) -> str: + """Get a stable key for grouping annotations by value.""" + if hasattr(ann.value, "answer"): + if isinstance(ann.value.answer, list): + # Checklist: stable key from selected option names + return str(sorted([opt.name for opt in ann.value.answer])) + elif hasattr(ann.value.answer, "name"): + # Radio: option name + return ann.value.answer.name + else: + # Text: the string value + return ann.value.answer + else: + return str(ann.value) + + def _get_nested_frames(self, obj: Any, parent_frames: List[Dict[str, int]], root_frames: Tuple[int, int]) -> List[Dict[str, int]]: + """Get frame range for nested classification object. + + If obj has start_frame/end_frame specified, use those. Otherwise default to root frames. + + Args: + obj: ClassificationAnswer or ClassificationAnnotation + parent_frames: Parent's frame list (for fallback) + root_frames: Root annotation's (start, end) tuple + + Returns: + List of frame dictionaries + """ + if hasattr(obj, 'start_frame') and obj.start_frame is not None and hasattr(obj, 'end_frame') and obj.end_frame is not None: + # Use explicitly specified frames + return [{"start": obj.start_frame, "end": obj.end_frame}] + else: + # Default to parent frames first, then root frames + if parent_frames: + return parent_frames + elif root_frames and root_frames[0] is not None and root_frames[1] is not None: + return [{"start": root_frames[0], "end": root_frames[1]}] + else: + return [] + + def _create_answer_entry(self, anns: List[TemporalAnnotation], frames: List[Dict[str, int]], root_frames: Tuple[int, int]) -> Dict[str, Any]: + """Create an answer entry from all annotations with the same value, merging their nested classifications. 
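+
+        Example of a returned entry (sketch) for a Radio answer spanning 200-1500 ms:
+            {"name": "first_radio_answer", "frames": [{"start": 200, "end": 1500}]}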
+ + Args: + anns: All annotations in the value group + frames: List of frame dictionaries for this answer + root_frames: Tuple of (start, end) from the root AudioClassificationAnnotation + """ + first_ann = anns[0] + + if hasattr(first_ann.value, "answer") and isinstance(first_ann.value.answer, list): + # Checklist: emit one entry per distinct option present across ALL annotations + # First, collect all unique option names across all annotations + all_option_names = set() + for ann in anns: + if hasattr(ann.value, "answer") and isinstance(ann.value.answer, list): + for opt in ann.value.answer: + all_option_names.add(opt.name) + + entries = [] + for opt_name in sorted(all_option_names): # Sort for consistent ordering + # For each unique option, collect frames and nested classifications from all annotations + opt_frames = [] + all_nested = [] + for ann in anns: + if hasattr(ann.value, "answer") and isinstance(ann.value.answer, list): + for ann_opt in ann.value.answer: + if ann_opt.name == opt_name: + # Get this annotation's root frame range + ann_start, ann_end = self.frame_extractor(ann) + ann_frame_dict = [{"start": ann_start, "end": ann_end}] + # Collect this option's frame range (from option or parent annotation) + frames_for_this_opt = self._get_nested_frames(ann_opt, ann_frame_dict, root_frames) + opt_frames.extend(frames_for_this_opt) + # Collect nested classifications + if hasattr(ann_opt, 'classifications') and ann_opt.classifications: + all_nested.extend(ann_opt.classifications) + + entry = {"name": opt_name, "frames": opt_frames} + if all_nested: + entry["classifications"] = self._serialize_explicit_classifications(all_nested, root_frames) + entries.append(entry) + return entries[0] if len(entries) == 1 else {"options": entries, "frames": frames} + elif hasattr(first_ann.value, "answer") and hasattr(first_ann.value.answer, "name"): + # Radio + opt = first_ann.value.answer + # Use the merged frames from all annotations (already passed in) + entry = {"name": opt.name, "frames": frames} + # Collect nested classifications from all annotations + all_nested = [] + for ann in anns: + if hasattr(ann.value, "answer") and hasattr(ann.value.answer, "classifications") and ann.value.answer.classifications: + all_nested.extend(ann.value.answer.classifications) + if all_nested: + entry["classifications"] = self._serialize_explicit_classifications(all_nested, root_frames) + return entry + else: + # Text - nesting is at the annotation level, not answer level + entry = {"value": first_ann.value.answer, "frames": frames} + # Collect nested classifications from all annotations + all_nested = [] + for ann in anns: + if hasattr(ann, 'classifications') and ann.classifications: + all_nested.extend(ann.classifications) + if all_nested: + entry["classifications"] = self._serialize_explicit_classifications(all_nested, root_frames) + return entry + + def _serialize_explicit_classifications(self, classifications: List[Any], root_frames: Tuple[int, int]) -> List[Dict[str, Any]]: + """Serialize explicitly nested ClassificationAnnotation objects. 
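+
+        Example of a returned list (sketch), for an explicitly nested radio question:
+            [{"name": "sub_radio_question",
+              "answer": [{"name": "first_sub_radio_answer",
+                          "frames": [{"start": 1000, "end": 1500}]}]}]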
+ + Args: + classifications: List of ClassificationAnnotation objects + root_frames: Tuple of (start, end) from root AudioClassificationAnnotation + + Returns: + List of serialized classification dictionaries + """ + result = [] + + # Group nested classifications by name + grouped = defaultdict(list) + for cls in classifications: + name = cls.feature_schema_id or cls.name + grouped[name].append(cls) + + # Serialize each group + for name, cls_list in grouped.items(): + # Get display name from first annotation + display_name = cls_list[0].name if cls_list[0].name else name + + # Create answer entries for this nested classification + # De-duplicate by answer value + seen_values = {} # value_key -> (answer_dict, nested_classifications) + for cls in cls_list: + # Get frames for this ClassificationAnnotation (from cls or root) + cls_frames = self._get_nested_frames(cls, [], root_frames) + value_key = self._get_value_key(cls) + + if hasattr(cls.value, "answer"): + if isinstance(cls.value.answer, list): + # Checklist + for opt in cls.value.answer: + # Get frames for this checklist option (from opt or cls or root) + opt_frames = self._get_nested_frames(opt, cls_frames, root_frames) + answer = {"name": opt.name, "frames": opt_frames} + # Collect nested for recursion + opt_nested = [] + if hasattr(opt, 'classifications') and opt.classifications: + opt_nested = opt.classifications + if opt_nested: + answer["classifications"] = self._serialize_explicit_classifications(opt_nested, root_frames) + # Note: Checklist options don't need de-duplication + # (they're already handled at the parent level) + if value_key not in seen_values: + seen_values[value_key] = [] + seen_values[value_key].append(answer) + elif hasattr(cls.value.answer, "name"): + # Radio - de-duplicate by name + opt = cls.value.answer + # Check if this answer has explicit frames + has_explicit_frames = (hasattr(opt, 'start_frame') and opt.start_frame is not None and + hasattr(opt, 'end_frame') and opt.end_frame is not None) + # Get frames for this radio answer (from opt or cls or root) + opt_frames = self._get_nested_frames(opt, cls_frames, root_frames) + + # Check if we've already seen this answer name + if value_key in seen_values: + # Only merge frames if both have explicit frames, or neither does + existing_has_explicit = seen_values[value_key].get("_has_explicit", False) + if has_explicit_frames and existing_has_explicit: + # Both explicit - merge + seen_values[value_key]["frames"].extend(opt_frames) + elif has_explicit_frames and not existing_has_explicit: + # Current is explicit, existing is implicit - replace with explicit + seen_values[value_key]["frames"] = opt_frames + seen_values[value_key]["_has_explicit"] = True + elif not has_explicit_frames and existing_has_explicit: + # Current is implicit, existing is explicit - keep existing (don't merge) + pass + else: + # Both implicit - merge + seen_values[value_key]["frames"].extend(opt_frames) + + # Always merge nested classifications + if hasattr(opt, 'classifications') and opt.classifications: + seen_values[value_key]["_nested"].extend(opt.classifications) + else: + answer = {"name": opt.name, "frames": opt_frames, "_nested": [], "_has_explicit": has_explicit_frames} + if hasattr(opt, 'classifications') and opt.classifications: + answer["_nested"] = list(opt.classifications) + seen_values[value_key] = answer + else: + # Text - check for annotation-level nesting + answer = {"value": cls.value.answer, "frames": cls_frames} + # Collect nested + text_nested = [] + if hasattr(cls, 
'classifications') and cls.classifications: + text_nested = cls.classifications + if text_nested: + answer["classifications"] = self._serialize_explicit_classifications(text_nested, root_frames) + if value_key not in seen_values: + seen_values[value_key] = [] + seen_values[value_key].append(answer) + + # Convert seen_values to answers list + answers = [] + for value_key, value_data in seen_values.items(): + if isinstance(value_data, list): + answers.extend(value_data) + else: + # Radio case - handle nested classifications + if value_data.get("_nested"): + value_data["classifications"] = self._serialize_explicit_classifications(value_data["_nested"], root_frames) + # Clean up internal fields + value_data.pop("_nested", None) + value_data.pop("_has_explicit", None) + answers.append(value_data) + + result.append({ + "name": display_name, + "answer": answers + }) + + return result + + +class HierarchyBuilder(Generic[TemporalAnnotation]): + """Builds hierarchical nested classifications from temporal annotations. + + NOTE: This builder only handles explicit nesting via ClassificationAnswer.classifications. + All nesting must be defined in the annotation structure itself, not inferred from temporal containment. + """ + + def __init__(self, group_manager: AnnotationGroupManager[TemporalAnnotation], value_grouper: ValueGrouper[TemporalAnnotation]): + self.group_manager = group_manager + self.value_grouper = value_grouper + + def build_hierarchy(self) -> List[Dict[str, Any]]: + """Build the complete hierarchical structure. + + All nesting is handled via explicit ClassificationAnswer.classifications, + so we simply group by value and let the ValueGrouper serialize the nested structure. + """ + results = [] + + for group_key in self.group_manager.root_groups: + group_anns = self.group_manager.groups[group_key] + top_entries = self.value_grouper.group_by_value(group_anns) + + results.append({ + "name": self.group_manager.get_group_display_name(group_key), + "answer": top_entries, + }) + + return results + + +class TemporalNDJSON(BaseModel): + """NDJSON format for temporal annotations (audio, video, etc.).""" + name: str + answer: List[Dict[str, Any]] + dataRow: Dict[str, str] + + +def create_temporal_ndjson_annotations(annotations: List[TemporalAnnotation], + data_global_key: str, + frame_extractor: callable) -> List[TemporalNDJSON]: + """ + Create NDJSON temporal annotations with hierarchical structure. + + Args: + annotations: List of temporal classification annotations + data_global_key: Global key for the data row + frame_extractor: Function that extracts (start, end) from annotation + + Returns: + List of TemporalNDJSON objects + """ + if not annotations: + return [] + + group_manager = AnnotationGroupManager(annotations, frame_extractor) + value_grouper = ValueGrouper(frame_extractor) + hierarchy_builder = HierarchyBuilder(group_manager, value_grouper) + hierarchy = hierarchy_builder.build_hierarchy() + + return [ + TemporalNDJSON( + name=item["name"], + answer=item["answer"], + dataRow={"globalKey": data_global_key} + ) + for item in hierarchy + ] + + +# Audio-specific convenience function +def create_audio_ndjson_annotations(annotations: List[AudioClassificationAnnotation], + data_global_key: str) -> List[TemporalNDJSON]: + """ + Create NDJSON audio annotations with hierarchical structure. 
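+
+    Example (sketch; annotation names and the global key are illustrative):
+
+        annotations = [
+            AudioClassificationAnnotation(
+                frame=200, end_frame=1500, name="radio_class",
+                value=Radio(answer=ClassificationAnswer(name="first_radio_answer")),
+            )
+        ]
+        create_audio_ndjson_annotations(annotations, "my_global_key")
+        # -> [TemporalNDJSON(name="radio_class",
+        #        answer=[{"name": "first_radio_answer",
+        #                 "frames": [{"start": 200, "end": 1500}]}],
+        #        dataRow={"globalKey": "my_global_key"})]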
+ + Args: + annotations: List of audio classification annotations + data_global_key: Global key for the data row + + Returns: + List of TemporalNDJSON objects + """ + def audio_frame_extractor(ann: AudioClassificationAnnotation) -> Tuple[int, int]: + return (ann.start_frame, ann.end_frame or ann.start_frame) + + return create_temporal_ndjson_annotations(annotations, data_global_key, audio_frame_extractor) diff --git a/libs/labelbox/tests/data/serialization/ndjson/test_audio.py b/libs/labelbox/tests/data/serialization/ndjson/test_audio.py new file mode 100644 index 000000000..038d4d526 --- /dev/null +++ b/libs/labelbox/tests/data/serialization/ndjson/test_audio.py @@ -0,0 +1,447 @@ +import labelbox.types as lb_types +from labelbox.data.serialization.ndjson.converter import NDJsonConverter + + +def test_audio_nested_text_radio_checklist_structure(): + # Purpose: verify that class-based AudioClassificationAnnotation inputs with explicit + # nesting serialize into v3-style nested NDJSON with: + # - exactly three top-level groups (text_class, radio_class, checklist_class) + # - explicit nesting via ClassificationAnnotation.classifications and ClassificationAnswer.classifications + # - nested classifications can specify their own start_frame/end_frame (subset of root) + # - correct field shapes per type (Text uses "value", Radio/Checklist use "name") + + # Build annotations using explicit nesting (NEW interface) matching exec/v3.py output shape + anns = [] + + # text_class: simple value without nesting + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1000, + end_frame=1100, + name="text_class", + value=lb_types.Text(answer="A"), + ) + ) + + # text_class: value WITH explicit nested classifications + # This annotation has nested classifications at the annotation level (for Text type) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1500, + end_frame=2400, # Root frame range + name="text_class", + value=lb_types.Text(answer="text_class value"), + classifications=[ # Explicit nesting via classifications field + lb_types.ClassificationAnnotation( + name="nested_text_class", + start_frame=1600, end_frame=2000, # Nested frame range (subset of root) + value=lb_types.Text(answer="nested_text_class value"), + classifications=[ # Deeper nesting + lb_types.ClassificationAnnotation( + name="nested_text_class_2", + start_frame=1800, end_frame=2000, # Even more specific nested range + value=lb_types.Text(answer="nested_text_class_2 value") + ) + ] + ), + lb_types.ClassificationAnnotation( + name="nested_text_class", + start_frame=2001, end_frame=2400, # Different nested frame range + value=lb_types.Text(answer="nested_text_class value2") + ) + ] + ) + ) + + # Additional text_class segments + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2500, + end_frame=2700, + name="text_class", + value=lb_types.Text(answer="C"), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2900, + end_frame=2999, + name="text_class", + value=lb_types.Text(answer="D"), + ) + ) + + # radio_class: Explicit nesting via ClassificationAnswer.classifications + # First segment with nested classifications + anns.append( + lb_types.AudioClassificationAnnotation( + frame=200, + end_frame=1500, # Root frame range + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_radio_answer", + classifications=[ # Explicit nesting at answer level for Radio + lb_types.ClassificationAnnotation( + name="sub_radio_question", + 
value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_sub_radio_answer", + start_frame=1000, end_frame=1500, # Nested frame range + classifications=[ # Deeper nesting + lb_types.ClassificationAnnotation( + name="sub_radio_question_2", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_sub_radio_answer_2", + start_frame=1300, end_frame=1500 # Even more specific nested range + ) + ) + ) + ] + ) + ) + ), + lb_types.ClassificationAnnotation( + name="sub_radio_question", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="second_sub_radio_answer", + start_frame=2100, end_frame=2500 # Nested frame range for second segment + ) + ) + ) + ] + ) + ), + ) + ) + + # Second segment for first_radio_answer (will merge frames in output) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2000, + end_frame=2500, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="first_radio_answer", + classifications=[ + lb_types.ClassificationAnnotation( + name="sub_radio_question", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer( + name="second_sub_radio_answer" + ) + ) + ) + ] + ) + ), + ) + ) + + # radio_class: second_radio_answer without nesting + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1550, + end_frame=1700, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="second_radio_answer") + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2700, + end_frame=3000, + name="radio_class", + value=lb_types.Radio( + answer=lb_types.ClassificationAnswer(name="second_radio_answer") + ), + ) + ) + + # checklist_class: Explicit nesting via ClassificationAnswer.classifications + # First segment with nested checklist + anns.append( + lb_types.AudioClassificationAnnotation( + frame=300, + end_frame=800, # Root frame range (first segment) + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer( + name="first_checklist_option", + classifications=[ # Explicit nesting at answer level for Checklist + lb_types.ClassificationAnnotation( + name="nested_checklist", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer( + name="nested_option_1", + start_frame=400, end_frame=700, # Nested frame range + classifications=[ # Deeper nesting + lb_types.ClassificationAnnotation( + name="checklist_nested_text", + start_frame=500, end_frame=700, # Even more specific nested range + value=lb_types.Text(answer="checklist_nested_text value") + ) + ] + ) + ] + ) + ) + ] + ) + ] + ), + ) + ) + + # Second segment for first_checklist_option with different nested options + anns.append( + lb_types.AudioClassificationAnnotation( + frame=1200, + end_frame=1800, # Root frame range (second segment) + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer( + name="first_checklist_option", + classifications=[ + lb_types.ClassificationAnnotation( + name="nested_checklist", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer( + name="nested_option_2", + start_frame=1200, end_frame=1600 # Nested frame range + ), + lb_types.ClassificationAnswer( + name="nested_option_3", + start_frame=1400, end_frame=1800 # Nested frame range + ) + ] + ) + ) + ] + ) + ] + ), + ) + ) + + # checklist_class: other options without nesting + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2200, + end_frame=2900, + name="checklist_class", + 
value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer(name="second_checklist_option") + ] + ), + ) + ) + anns.append( + lb_types.AudioClassificationAnnotation( + frame=2500, + end_frame=3500, + name="checklist_class", + value=lb_types.Checklist( + answer=[ + lb_types.ClassificationAnswer(name="third_checklist_option") + ] + ), + ) + ) + + # Serialize a single Label containing all of the above annotations + label = lb_types.Label( + data={"global_key": "audio_nested_test_key"}, annotations=anns + ) + ndjson = list(NDJsonConverter.serialize([label])) + + # Assert: exactly three top-level groups, matching v3 root objects + assert {obj["name"] for obj in ndjson} == { + "text_class", + "radio_class", + "checklist_class", + } + + # Validate text_class structure with explicit nesting and frame ranges + text_nd = next(obj for obj in ndjson if obj["name"] == "text_class") + + # Check that we have 4 text_class answers (A, text_class value, C, D) + assert len(text_nd["answer"]) == 4 + + # Find the parent answer with nested classifications + parent = next( + item + for item in text_nd["answer"] + if item.get("value") == "text_class value" + ) + assert parent["frames"] == [{"start": 1500, "end": 2400}] + + # Check explicit nested classifications + nested = parent.get("classifications", []) + assert len(nested) == 1 # One nested_text_class group + nt = nested[0] + assert nt["name"] == "nested_text_class" + + # Check nested_text_class has 2 answers with different frame ranges + assert len(nt["answer"]) == 2 + nt_ans_1 = nt["answer"][0] + assert nt_ans_1["value"] == "nested_text_class value" + assert nt_ans_1["frames"] == [{"start": 1600, "end": 2000}] # Nested frame range + + # Check nested_text_class_2 is nested under nested_text_class + nt_nested = nt_ans_1.get("classifications", []) + assert len(nt_nested) == 1 + nt2 = nt_nested[0] + assert nt2["name"] == "nested_text_class_2" + assert nt2["answer"][0]["value"] == "nested_text_class_2 value" + assert nt2["answer"][0]["frames"] == [{"start": 1800, "end": 2000}] # Even more specific nested range + + # Check second nested_text_class answer + nt_ans_2 = nt["answer"][1] + assert nt_ans_2["value"] == "nested_text_class value2" + assert nt_ans_2["frames"] == [{"start": 2001, "end": 2400}] # Different nested frame range + + # Validate radio_class structure with explicit nesting and frame ranges + radio_nd = next(obj for obj in ndjson if obj["name"] == "radio_class") + + # Check first_radio_answer + # Note: Segments with the same answer value are merged (both segments have "first_radio_answer") + first_radios = [ + a for a in radio_nd["answer"] if a["name"] == "first_radio_answer" + ] + # We get one merged answer with both frame ranges + assert len(first_radios) == 1 + first_radio = first_radios[0] + # Merged frames from both segments: [200-1500] and [2000-2500] + assert first_radio["frames"] == [{"start": 200, "end": 1500}, {"start": 2000, "end": 2500}] + + # Check explicit nested sub_radio_question + assert "classifications" in first_radio + sub_radio = next( + c + for c in first_radio["classifications"] + if c["name"] == "sub_radio_question" + ) + + # Check sub_radio_question has 2 answers with specific frame ranges + assert len(sub_radio["answer"]) == 2 + sr_first = next( + a for a in sub_radio["answer"] if a["name"] == "first_sub_radio_answer" + ) + assert sr_first["frames"] == [{"start": 1000, "end": 1500}] # Nested frame range + + # Check sub_radio_question_2 is nested under first_sub_radio_answer + assert "classifications" in sr_first + 
sr2 = next( + c + for c in sr_first["classifications"] + if c["name"] == "sub_radio_question_2" + ) + assert sr2["answer"][0]["name"] == "first_sub_radio_answer_2" + assert sr2["answer"][0]["frames"] == [{"start": 1300, "end": 1500}] # Even more specific nested range + + # Check second_sub_radio_answer + sr_second = next( + a for a in sub_radio["answer"] if a["name"] == "second_sub_radio_answer" + ) + # Has specific nested frame range from first segment + assert sr_second["frames"] == [{"start": 2100, "end": 2500}] + + # Validate checklist_class structure with explicit nesting and frame ranges + checklist_nd = next( + obj for obj in ndjson if obj["name"] == "checklist_class" + ) + + # Check first_checklist_option + # Note: segments with the same answer value are merged + first_opts = [ + a + for a in checklist_nd["answer"] + if a["name"] == "first_checklist_option" + ] + assert len(first_opts) == 1 + first_opt = first_opts[0] + # Merged frames from both segments: [300-800] and [1200-1800] + assert first_opt["frames"] == [{"start": 300, "end": 800}, {"start": 1200, "end": 1800}] + + # Check explicit nested_checklist + assert "classifications" in first_opt + nested_checklist = next( + c + for c in first_opt["classifications"] + if c["name"] == "nested_checklist" + ) + + # Check nested_checklist has all 3 options (nested_option_1, 2, 3) from both segments + assert len(nested_checklist["answer"]) == 3 + + # Check nested_option_1 with specific frame range + opt1 = next( + a for a in nested_checklist["answer"] if a["name"] == "nested_option_1" + ) + assert opt1["frames"] == [{"start": 400, "end": 700}] # Nested frame range + + # Check checklist_nested_text is nested under nested_option_1 + assert "classifications" in opt1 + nested_text = next( + c + for c in opt1["classifications"] + if c["name"] == "checklist_nested_text" + ) + assert nested_text["answer"][0]["value"] == "checklist_nested_text value" + assert nested_text["answer"][0]["frames"] == [{"start": 500, "end": 700}] # Even more specific nested range + + +def test_audio_top_level_only_basic(): + anns = [ + lb_types.AudioClassificationAnnotation( + frame=200, + end_frame=1500, + name="radio_class", + value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="first_radio_answer")), + ), + lb_types.AudioClassificationAnnotation( + frame=1550, + end_frame=1700, + name="radio_class", + value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="second_radio_answer")), + ), + lb_types.AudioClassificationAnnotation( + frame=1200, + end_frame=1800, + name="checklist_class", + value=lb_types.Checklist(answer=[lb_types.ClassificationAnswer(name="angry")]), + ), + ] + + label = lb_types.Label(data={"global_key": "audio_top_level_only"}, annotations=anns) + ndjson = list(NDJsonConverter.serialize([label])) + + names = {o["name"] for o in ndjson} + assert names == {"radio_class", "checklist_class"} + + radio = next(o for o in ndjson if o["name"] == "radio_class") + r_answers = sorted(radio["answer"], key=lambda x: x["frames"][0]["start"]) + assert r_answers[0]["name"] == "first_radio_answer" + assert r_answers[0]["frames"] == [{"start": 200, "end": 1500}] + assert "classifications" not in r_answers[0] + assert r_answers[1]["name"] == "second_radio_answer" + assert r_answers[1]["frames"] == [{"start": 1550, "end": 1700}] + assert "classifications" not in r_answers[1] + + checklist = next(o for o in ndjson if o["name"] == "checklist_class") + c_answers = checklist["answer"] + assert len(c_answers) == 1 + assert c_answers[0]["name"] == "angry" 
+ assert c_answers[0]["frames"] == [{"start": 1200, "end": 1800}] + assert "classifications" not in c_answers[0]
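+
+
+def test_audio_frame_annotations_lookup():
+    # Minimal sketch: Label.frame_annotations() keys audio temporal annotations
+    # by their start_frame (in milliseconds).
+    ann = lb_types.AudioClassificationAnnotation(
+        frame=2500,
+        end_frame=4100,
+        name="text_class",
+        value=lb_types.Text(answer="Speaker 1"),
+    )
+    label = lb_types.Label(
+        data={"global_key": "audio_frame_lookup"}, annotations=[ann]
+    )
+    frames = label.frame_annotations()
+    assert frames[2500] == [ann]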