From 56e3c7a811754061e3c8e29a00a0835619bbb32c Mon Sep 17 00:00:00 2001 From: Madhu Nunna Date: Thu, 5 Mar 2026 14:49:43 -0800 Subject: [PATCH] fix: add code sample to retrieve eval summary scores from S3 The get_evaluation_job API does not return metric scores directly. Scores are written to the output S3 bucket. Added a notebook cell showing how to locate and parse evaluationSummary.json to access custom metric scores and prompt counts after a job completes. Fixes #653 --- .../custom-metrics-model-evaluation.ipynb | 128 ++++++++++-------- 1 file changed, 70 insertions(+), 58 deletions(-) diff --git a/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb b/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb index c277ed2a7..3467c3841 100644 --- a/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb +++ b/evaluation-observe/bedrock-eval-custom-metrics/custom-metrics-model-evaluation.ipynb @@ -180,64 +180,40 @@ "metadata": {}, "outputs": [], "source": [ - "# Create the model evaluation job\n", - "model_eval_job_name = f\"model-evaluation-custom-metrics{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}\"\n", - "\n", - "model_eval_job = bedrock_client.create_evaluation_job(\n", - " jobName=model_eval_job_name,\n", - " jobDescription=\"Evaluate model performance with custom comprehensiveness metric\",\n", - " roleArn=role_arn,\n", - " applicationType=\"ModelEvaluation\",\n", - " inferenceConfig={\n", - " \"models\": [{\n", - " \"bedrockModel\": {\n", - " \"modelIdentifier\": generator_model\n", - " }\n", - " }]\n", - " },\n", - " outputDataConfig={\n", - " \"s3Uri\": output_path\n", - " },\n", - " evaluationConfig={\n", - " \"automated\": {\n", - " \"datasetMetricConfigs\": [{\n", - " \"taskType\": \"General\",\n", - " \"dataset\": {\n", - " \"name\": \"ModelEvalDataset\",\n", - " \"datasetLocation\": {\n", - " \"s3Uri\": input_data\n", - " }\n", - " },\n", - " \"metricNames\": 
[\n", - " \"Builtin.Correctness\",\n", - " \"Builtin.Completeness\",\n", - " \"Builtin.Coherence\",\n", - " \"Builtin.Relevance\",\n", - " \"Builtin.FollowingInstructions\",\n", - " \"comprehensiveness\"\n", - " ]\n", - " }],\n", - " \"customMetricConfig\": {\n", - " \"customMetrics\": [\n", - " comprehensiveness_metric\n", - " ],\n", - " \"evaluatorModelConfig\": {\n", - " \"bedrockEvaluatorModels\": [{\n", - " \"modelIdentifier\": custom_metrics_evaluator_model\n", - " }]\n", - " }\n", - " },\n", - " \"evaluatorModelConfig\": {\n", - " \"bedrockEvaluatorModels\": [{\n", - " \"modelIdentifier\": evaluator_model\n", - " }]\n", - " }\n", - " }\n", - " }\n", - ")\n", + "import boto3\n", + "import json\n", + "from collections import defaultdict\n", "\n", - "print(f\"Created model evaluation job: {model_eval_job_name}\")\n", - "print(f\"Job ID: {model_eval_job['jobArn']}\")" + "# Retrieve evaluation scores from S3 output after job completes\n", + "# The output S3 URI is available in the get_evaluation_job response\n", + "output_s3_uri = response['outputDataConfig']['s3Uri']\n", + "bucket = output_s3_uri.split('/')[2]\n", + "prefix = '/'.join(output_s3_uri.split('/')[3:])\n", + "\n", + "s3_client = boto3.client('s3')\n", + "objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)\n", + "\n", + "# Aggregate scores across all output jsonl files\n", + "metric_scores = defaultdict(list)\n", + "dataset_prompt_counts = defaultdict(int)\n", + "\n", + "for obj in objects.get('Contents', []):\n", + " key = obj['Key']\n", + " if not key.endswith('_output.jsonl'):\n", + " continue\n", + " dataset_name = key.split('/datasets/')[1].split('/')[0]\n", + " body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read().decode()\n", + " for line in body.strip().split('\\n'):\n", + " record = json.loads(line)\n", + " dataset_prompt_counts[dataset_name] += 1\n", + " for score in record.get('automatedEvaluationResult', {}).get('scores', []):\n", + " 
metric_scores[f\"{dataset_name}/{score['metricName']}\"].append(score['result'])\n", + "\n", + "# Print average scores and prompt counts per dataset/metric\n", + "for key, scores in metric_scores.items():\n", + " dataset, metric = key.split('/', 1)\n", + " avg = sum(scores) / len(scores)\n", + " print(f\"Dataset: {dataset} | Metric: {metric} | Avg Score: {avg:.4f} | Prompts: {dataset_prompt_counts[dataset]}\")" ] }, { @@ -266,6 +242,42 @@ "print(f\"Job Status: {response['status']}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import json\n", + "\n", + "# Retrieve evaluation summary scores from S3 after job completes\n", + "# The output S3 URI is available in the get_evaluation_job response\n", + "output_s3_uri = response['outputDataConfig']['s3Uri'] # e.g. s3://bucket/prefix/\n", + "bucket = output_s3_uri.split('/')[2]\n", + "prefix = '/'.join(output_s3_uri.split('/')[3:])\n", + "\n", + "s3_client = boto3.client('s3')\n", + "\n", + "# List objects to find the evaluation summary JSON\n", + "objects = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)\n", + "summary_key = next(\n", + " obj['Key'] for obj in objects.get('Contents', [])\n", + " if obj['Key'].endswith('evaluationSummary.json')\n", + ")\n", + "\n", + "# Download and parse the evaluation summary\n", + "summary_obj = s3_client.get_object(Bucket=bucket, Key=summary_key)\n", + "summary = json.loads(summary_obj['Body'].read())\n", + "\n", + "# Print custom metric scores and prompt counts\n", + "for dataset_result in summary.get('datasetMetricResults', []):\n", + " print(f\"Dataset: {dataset_result['datasetName']}\")\n", + " print(f\" Number of prompts: {dataset_result.get('numberOfPrompts')}\")\n", + " for metric in dataset_result.get('metricResults', []):\n", + " print(f\" Metric: {metric['metricName']} | Score: {metric.get('score')}\")" + ] + }, { "cell_type": "markdown", "id": 
"46961e92-4bbb-436a-8929-926e99c5073a", @@ -314,4 +326,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +}