From 56e3c7a811754061e3c8e29a00a0835619bbb32c Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Thu, 5 Mar 2026 14:49:43 -0800 Subject: [PATCH] fix: add code sample to retrieve eval summary scores from S3 The get_evaluation_job API does not return metric scores directly. Scores are written to the output S3 bucket. Added a notebook cell showing how to locate and parse evaluationSummary.json to access custom metric scores and prompt counts after a job completes. Fixes #653 --- .../custom-metrics-model-evaluation.ipynb | 128 ++++++++++-------- 1 file changed, 70 insertions(+), 58 deletions(-) diff --git a/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb b/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb index c277ed2a7..3467c3841 100644 --- a/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb +++ b/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb @@ -180,64 +180,40 @@ "metadata": {}, "outputs": [], "source": [ - "# Create the model evaluation job\n", - "model_eval_job_name = f\"model-evaluation-custom-metrics{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}\"\n", - "\n", - "model_eval_job = bedrock_client.create_evaluation_job(\n", - " jobName=model_eval_job_name,\n", - " jobDescription=\"Evaluate model performance with custom comprehensiveness metric\",\n", - " roleArn=role_arn,\n", - " applicationType=\"ModelEvaluation\",\n", - " inferenceConfig={\n", - " \"models\": [{\n", - " \"bedrockModel\": {\n", - " \"modelIdentifier\": generator_model\n", - " }\n", - " }]\n", - " },\n", - " outputDataConfig={\n", - " \"s3Uri\": output_path\n", - " },\n", - " evaluationConfig={\n", - " \"automated\": {\n", - " \"datasetMetricConfigs\": [{\n", - " \"taskType\": \"General\",\n", - " \"dataset\": {\n", - " \"name\": \"ModelEvalDataset\",\n", - " \"datasetLocation\": {\n", - " \"s3Uri\": input_data\n", - " }\n", - " },\n", - " \"metricNames\": 
[\n", - " \"Builtin.Correctness\",\n", - " \"Builtin.Completeness\",\n", - " \"Builtin.Coherence\",\n", - " \"Builtin.Relevance\",\n", - " \"Builtin.FollowingInstructions\",\n", - " \"comprehensiveness\"\n", - " ]\n", - " }],\n", - " \"customMetricConfig\": {\n", - " \"customMetrics\": [\n", - " comprehensiveness_metric\n", - " ],\n", - " \"evaluatorModelConfig\": {\n", - " \"bedrockEvaluatorModels\": [{\n", - " \"modelIdentifier\": custom_metrics_evaluator_model\n", - " }]\n", - " }\n", - " },\n", - " \"evaluatorModelConfig\": {\n", - " \"bedrockEvaluatorModels\": [{\n", - " \"modelIdentifier\": evaluator_model\n", - " }]\n", - " }\n", - " }\n", - " }\n", - ")\n", + "import boto3\n", + "import json\n", + "from collections import defaultdict\n", "\n", - "print(f\"Created model evaluation job: {model_eval_job_name}\")\n", - "print(f\"Job ID: {model_eval_job['jobArn']}\")" + "# Retrieve evaluation scores from S3 output after job completes\n", + "# The output S3 URI is available in the get_evaluation_job response\n", + "output_s3_uri = response['outputDataConfig']['s3Uri']\n", + "bucket = output_s3_uri.split('/')[2]\n", + "prefix = '/'.join(output_s3_uri.split('/')[3:])\n", + "\n", + "s3_client = boto3.client('s3')\n", + "objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)\n", + "\n", + "# Aggregate scores across all output jsonl files\n", + "metric_scores = defaultdict(list)\n", + "dataset_prompt_counts = defaultdict(int)\n", + "\n", + "for obj in objects.get('Contents', []):\n", + " key = obj['Key']\n", + " if not key.endswith('_output.jsonl'):\n", + " continue\n", + " dataset_name = key.split('/datasets/')[1].split('/')[0]\n", + " body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read().decode()\n", + " for line in body.strip().split('\\n'):\n", + " record = json.loads(line)\n", + " dataset_prompt_counts[dataset_name] += 1\n", + " for score in record.get('automatedEvaluationResult', {}).get('scores', []):\n", + " 
metric_scores[f\"{dataset_name}/{score['metricName']}\"].append(score['result'])\n", + "\n", + "# Print average scores and prompt counts per dataset/metric\n", + "for key, scores in metric_scores.items():\n", + " dataset, metric = key.split('/', 1)\n", + " avg = sum(scores) / len(scores)\n", + " print(f\"Dataset: {dataset} | Metric: {metric} | Avg Score: {avg:.4f} | Prompts: {dataset_prompt_counts[dataset]}\")" ] }, { @@ -266,6 +242,42 @@ "print(f\"Job Status: {response['status']}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import json\n", + "\n", + "# Retrieve evaluation summary scores from S3 after job completes\n", + "# The output S3 URI is available in the get_evaluation_job response\n", + "output_s3_uri = response['outputDataConfig']['s3Uri'] # e.g. s3://bucket/prefix/\n", + "bucket = output_s3_uri.split('/')[2]\n", + "prefix = '/'.join(output_s3_uri.split('/')[3:])\n", + "\n", + "s3_client = boto3.client('s3')\n", + "\n", + "# List objects to find the evaluation summary JSON\n", + "objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)\n", + "summary_key = next(\n", + " obj['Key'] for obj in objects.get('Contents', [])\n", + " if obj['Key'].endswith('evaluationSummary.json')\n", + ")\n", + "\n", + "# Download and parse the evaluation summary\n", + "summary_obj = s3_client.get_object(Bucket=bucket, Key=summary_key)\n", + "summary = json.loads(summary_obj['Body'].read())\n", + "\n", + "# Print custom metric scores and prompt counts\n", + "for dataset_result in summary.get('datasetMetricResults', []):\n", + " print(f\"Dataset: {dataset_result['datasetName']}\")\n", + " print(f\" Number of prompts: {dataset_result.get('numberOfPrompts')}\")\n", + " for metric in dataset_result.get('metricResults', []):\n", + " print(f\" Metric: {metric['metricName']} | Score: {metric.get('score')}\")" + ] + }, { "cell_type": "markdown", "id": 
"46961e92-4bbb-436a-8929-926e99c5073a", @@ -314,4 +326,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +}