 and have confidence that they’re always shipping the best model.
 """
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union

 from ..connection import Connection
-from ..constants import DATASET_ITEMS_KEY, NAME_KEY, SLICE_ID_KEY
+from ..constants import DATASET_ITEMS_KEY, NAME_KEY, SCENES_KEY, SLICE_ID_KEY
 from ..dataset_item import DatasetItem
+from ..scene import Scene
 from .constants import (
     EVAL_FUNCTION_ID_KEY,
     SCENARIO_TEST_ID_KEY,
     SCENARIO_TEST_METRICS_KEY,
     THRESHOLD_COMPARISON_KEY,
     THRESHOLD_KEY,
+    EntityLevel,
     ThresholdComparison,
 )
 from .data_transfer_objects.scenario_test_evaluations import EvaluationResult
@@ -162,16 +164,31 @@ def get_eval_history(self) -> List[ScenarioTestEvaluation]:
         ]
         return evaluations

-    def get_items(self) -> List[DatasetItem]:
+    def get_items(
+        self, level: EntityLevel = EntityLevel.ITEM
+    ) -> Union[List[DatasetItem], List[Scene]]:
170+ """Gets items within a scenario test at a given level, returning a list of DatasetItem or Scene objects.
171+
172+ Args:
173+ level: :class:`EntityLevel`
174+
175+ Returns:
176+ A list of :class:`ScenarioTestEvaluation` objects.
177+ """
         response = self.connection.get(
             f"validate/scenario_test/{self.id}/items",
         )
+        if level == EntityLevel.SCENE:
+            return [
+                Scene.from_json(scene, skip_validate=True)
+                for scene in response[SCENES_KEY]
+            ]
         return [
             DatasetItem.from_json(item) for item in response[DATASET_ITEMS_KEY]
         ]

     def set_baseline_model(self, model_id: str):
-        """Set's a new baseline model for the ScenarioTest. In order to be eligible to be a baseline,
+        """Sets a new baseline model for the ScenarioTest. In order to be eligible to be a baseline,
         this scenario test must have been evaluated using that model. The baseline model's performance
         is used as the threshold for all metrics against which other models are compared.

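The two methods touched in this hunk would be exercised roughly as follows. This is a minimal sketch: the existing ScenarioTest instance, the model ID, and the import path for EntityLevel are assumptions, not shown in this diff.

    # Assumes `scenario_test` is an existing ScenarioTest and "model_123" was already evaluated on it.
    from nucleus.validate.constants import EntityLevel  # import path assumed from this package layout

    items = scenario_test.get_items()                          # defaults to EntityLevel.ITEM -> List[DatasetItem]
    scenes = scenario_test.get_items(level=EntityLevel.SCENE)  # -> List[Scene], parsed with skip_validate=True
    scenario_test.set_baseline_model("model_123")              # later runs are compared against this model's metrics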
@@ -205,14 +222,28 @@ def upload_external_evaluation_results(
             len(results) > 0
         ), "Submitting evaluation requires at least one result."

+        level = EntityLevel.ITEM
         metric_per_ref_id = {}
         weight_per_ref_id = {}
         aggregate_weighted_sum = 0.0
         aggregate_weight = 0.0
+
         # aggregation based on https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
         for r in results:
-            metric_per_ref_id[r.item_ref_id] = r.score
-            weight_per_ref_id[r.item_ref_id] = r.weight
+            # Ensure results are uploaded ONLY for items or ONLY for scenes
+            if r.scene_ref_id is not None:
+                level = EntityLevel.SCENE
+            if r.item_ref_id is not None and level == EntityLevel.SCENE:
+                raise ValueError(
+                    "All evaluation results must either pertain to a scene_ref_id or an item_ref_id, not both."
+                )
+            ref_id = (
+                r.item_ref_id if level == EntityLevel.ITEM else r.scene_ref_id
+            )
+
+            # Aggregate scores and weights
+            metric_per_ref_id[ref_id] = r.score
+            weight_per_ref_id[ref_id] = r.weight
             aggregate_weighted_sum += r.score * r.weight
             aggregate_weight += r.weight

@@ -224,6 +255,7 @@ def upload_external_evaluation_results(
224255 "overall_metric" : aggregate_weighted_sum / aggregate_weight ,
225256 "model_id" : model_id ,
226257 "slice_id" : self .slice_id ,
258+ "level" : level .value ,
227259 }
228260 response = self .connection .post (
229261 payload ,
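For the new scene-level path, a caller-side sketch of how results might be submitted. The EvaluationResult fields used here (scene_ref_id, score, weight) come from this diff, but the exact constructor and the evaluation-function argument (eval_fn) are assumptions since the full signature is not shown in this hunk; the final comment just reproduces the weighted arithmetic mean computed in the loop above.

    # Hypothetical scene-level results; ref IDs, scores, and weights are made up.
    results = [
        EvaluationResult(scene_ref_id="scene_a", score=0.8, weight=2.0),
        EvaluationResult(scene_ref_id="scene_b", score=0.5, weight=1.0),
    ]
    scenario_test.upload_external_evaluation_results(eval_fn, results, "model_123")
    # overall_metric = (0.8 * 2.0 + 0.5 * 1.0) / (2.0 + 1.0) = 0.7,
    # and the payload carries "level": EntityLevel.SCENE.value.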