@@ -18,9 +18,15 @@
     THRESHOLD_KEY,
     ThresholdComparison,
 )
-from .data_transfer_objects.scenario_test_evaluations import GetEvalHistory
+from .data_transfer_objects.scenario_test_evaluations import (
+    EvaluationResult,
+    GetEvalHistory,
+)
 from .data_transfer_objects.scenario_test_metric import AddScenarioTestFunction
-from .eval_functions.available_eval_functions import EvalFunction
+from .eval_functions.available_eval_functions import (
+    EvalFunction,
+    ExternalEvalFunction,
+)
 from .scenario_test_evaluation import ScenarioTestEvaluation
 from .scenario_test_metric import ScenarioTestMetric
 
@@ -83,9 +89,13 @@ def add_eval_function( |
         Args:
             eval_function: :class:`EvalFunction`
 
+        Raises:
+            NucleusAPIError: If adding this function would mix external and non-external eval functions on the same scenario test, which is not permitted.
+
         Returns:
             The created ScenarioTestMetric object.
         """
+
         response = self.connection.post(
             AddScenarioTestFunction(
                 scenario_test_name=self.name,
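For illustration, a minimal sketch of how the new `Raises` contract surfaces to callers; `scenario_test` and `builtin_fn` are hypothetical handles, and the `nucleus.errors.NucleusAPIError` import path is assumed:

```python
from nucleus.errors import NucleusAPIError

# Hypothetical setup: `scenario_test` already has an external eval
# function attached, and `builtin_fn` is a non-external eval function.
try:
    scenario_test.add_eval_function(builtin_fn)
except NucleusAPIError:
    # The API rejects mixing external and non-external functions
    # on a single scenario test.
    print("Cannot mix external and non-external eval functions.")
```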
@@ -174,3 +184,43 @@ def set_baseline_model(self, model_id: str): |
         )
         self.baseline_model_id = response.get("baseline_model_id")
         return self.baseline_model_id
+
+    def upload_external_evaluation_results(
+        self,
+        eval_fn: ExternalEvalFunction,
+        results: List[EvaluationResult],
+        model_id: str,
+    ):
+        assert (
+            eval_fn.eval_func_entry.is_external_function
+        ), "Submitting evaluation results is only available for external functions."
+
+        assert (
+            len(results) > 0
+        ), "Submitting evaluation results requires at least one result."
+
+        metric_per_ref_id = {}
+        weight_per_ref_id = {}
+        aggregate_weighted_sum = 0.0
+        aggregate_weight = 0.0
+        # Aggregation based on https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
+        for r in results:
+            metric_per_ref_id[r.item_ref_id] = r.score
+            weight_per_ref_id[r.item_ref_id] = r.weight
+            aggregate_weighted_sum += r.score * r.weight
+            aggregate_weight += r.weight
+
+        payload = {
+            "unit_test_id": self.id,
+            "eval_function_id": eval_fn.id,
+            "result_per_ref_id": metric_per_ref_id,
+            "weight_per_ref_id": weight_per_ref_id,
+            "overall_metric": aggregate_weighted_sum / aggregate_weight,
+            "model_id": model_id,
+            "slice_id": self.slice_id,
+        }
+        response = self.connection.post(
+            payload,
+            "validate/scenario_test/upload_results",
+        )
+        return response
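The `overall_metric` sent in the payload is the weighted arithmetic mean of the per-item scores, `sum(score_i * weight_i) / sum(weight_i)`; a total weight of zero would raise `ZeroDivisionError`. A minimal usage sketch follows; the import path is inferred from the relative imports above, and `scenario_test`, `external_fn`, the ref IDs, and the model ID are all hypothetical:

```python
# Import path assumed from the relative imports in this diff.
from nucleus.validate.data_transfer_objects.scenario_test_evaluations import (
    EvaluationResult,
)

# Hypothetical: one score per dataset item in the scenario test's slice,
# with per-item weights used in the aggregate.
results = [
    EvaluationResult(item_ref_id="img_001", score=0.8, weight=1.0),
    EvaluationResult(item_ref_id="img_002", score=0.2, weight=3.0),
]
scenario_test.upload_external_evaluation_results(
    eval_fn=external_fn,  # an ExternalEvalFunction handle
    results=results,
    model_id="model_abc123",  # hypothetical model ID
)
# overall_metric = (0.8 * 1.0 + 0.2 * 3.0) / (1.0 + 3.0) = 0.35
```

Note that the payload key `unit_test_id` carries the scenario test's `self.id`, presumably a holdover from earlier "unit test" naming.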