diff --git a/migrations/360-eval/default-config/judge_profiles.jsonl b/migrations/360-eval/default-config/judge_profiles.jsonl
index f74d0d835..74f208526 100644
--- a/migrations/360-eval/default-config/judge_profiles.jsonl
+++ b/migrations/360-eval/default-config/judge_profiles.jsonl
@@ -1,3 +1,3 @@
-{"model_id": "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.003, "output_cost_per_1k": 0.015}
-{"model_id": "bedrock/us.meta.llama4-maverick-17b-instruct-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.00024, "output_cost_per_1k": 0.00097}
-{"model_id": "bedrock/us.amazon.nova-premier-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.0025, "output_cost_per_1k": 0.0125}
\ No newline at end of file
+{"model_id": "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.003, "output_cost_per_1k": 0.015}
+{"model_id": "bedrock/us.amazon.nova-premier-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.0025, "output_cost_per_1k": 0.0125}
+{"model_id": "bedrock/qwen.qwen3-coder-480b-a35b-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.00022, "output_cost_per_1k": 0.0018}
diff --git a/migrations/360-eval/default-config/models_profiles.jsonl b/migrations/360-eval/default-config/models_profiles.jsonl
index ec4f15888..669727475 100644
--- a/migrations/360-eval/default-config/models_profiles.jsonl
+++ b/migrations/360-eval/default-config/models_profiles.jsonl
@@ -1,8 +1,12 @@
{"model_id":"bedrock/us.amazon.nova-pro-v1:0", "region": "us-west-2","input_token_cost": 0.0008, "output_token_cost": 0.0032}
{"model_id":"bedrock/us.amazon.nova-micro-v1:0", "region": "us-west-2","input_token_cost": 0.000035, "output_token_cost": 0.00014}
{"model_id":"bedrock/us.amazon.nova-lite-v1:0", "region": "us-west-2","input_token_cost": 0.00006, "output_token_cost": 0.00024}
-{"model_id":"bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0", "region": "us-east-1","input_token_cost": 0.0008, "output_token_cost": 0.004}
-{"model_id":"bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0","region": "us-east-1", "input_token_cost": 0.003, "output_token_cost": 0.015}
{"model_id":"bedrock/us.meta.llama3-3-70b-instruct-v1:0", "region": "us-east-2", "input_token_cost": 0.00072, "output_token_cost": 0.00072}
{"model_id":"bedrock/us.mistral.mixtral-8x7b-instruct-v0:1", "region": "us-east-2", "input_token_cost": 0.00045, "output_token_cost": 0.0007}
{"model_id":"bedrock/us.mistral.pixtral-large-2502-v1:0", "region": "us-east-1", "input_token_cost": 0.002, "output_token_cost": 0.006}
+{"model_id":"bedrock/openai.gpt-oss-120b-1:0", "region": "us-west-2", "input_token_cost": 0.00015, "output_token_cost": 0.0006}
+{"model_id":"bedrock/openai.gpt-oss-20b-1:0", "region": "us-west-2", "input_token_cost": 0.00007, "output_token_cost": 0.0003}
+{"model_id":"bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", "region": "us-west-2", "input_token_cost": 0.003, "output_token_cost": 0.015}
+{"model_id":"bedrock/deepseek.v3-v1:0", "region": "us-west-2", "input_token_cost": 0.00058, "output_token_cost": 0.00168}
+{"model_id":"bedrock/qwen.qwen3-coder-30b-a3b-v1:0", "region": "us-west-2", "input_token_cost": 0.00015, "output_token_cost": 0.0006}
+
diff --git a/migrations/360-eval/src/benchmarks_run.py b/migrations/360-eval/src/benchmarks_run.py
index 5817581d3..410f59a12 100644
--- a/migrations/360-eval/src/benchmarks_run.py
+++ b/migrations/360-eval/src/benchmarks_run.py
@@ -50,7 +50,7 @@ def evaluate_with_llm_judge(judge_model_id,
model_response,
golden_answer)
- cfg = {"maxTokens": 1500, "temperature": 0.3, "topP": 0.9, "aws_region_name": judge_region}
+ cfg = {"maxTokens": 1500, "topP": 0.9, "aws_region_name": judge_region}
try:
resp = run_inference(model_name=judge_model_id,
prompt_text=eval_template,
@@ -105,8 +105,11 @@ def evaluate_with_judges(judges,
for j in judges:
try:
logging.debug(f"Evaluating with judge model {j['model_id']}")
+ model_identification = j["model_id"]
+ if "bedrock" in j["model_id"]:
+ model_identification = model_identification.replace("bedrock", "bedrock/converse")
r = evaluate_with_llm_judge(
- judge_model_id=j["model_id"],
+ judge_model_id=model_identification,
judge_region=j["region"],
prompt=prompt,
model_response=model_response,
@@ -193,7 +196,8 @@ def benchmark(
params['api_key'] = os.getenv('AZURE_API_KEY')
elif "bedrock" in model_id:
params['aws_region_name'] = region
- model_id = model_id.replace("bedrock", "bedrock/converse")
+ if 'converse' not in model_id:
+ model_id = model_id.replace("bedrock", "bedrock/converse")
elif 'openai/' in model_id:
params['api_key'] = os.getenv('OPENAI_API')
else:
@@ -415,7 +419,8 @@ def check_single_model(model):
params['api_key'] = os.getenv('GOOGLE_API')
elif 'azure' in model_id:
params['api_key'] = os.getenv('AZURE_API_KEY')
- model_id = model_id.replace("bedrock", "bedrock/converse")
+ elif 'bedrock' in model_id and 'converse' not in model_id:
+ model_id = model_id.replace("bedrock", "bedrock/converse")
elif 'openai/' in model_id:
params['api_key'] = os.getenv('OPENAI_API')
else:
@@ -440,7 +445,7 @@ def check_single_model(model):
executor.submit(check_single_model, model): model
for model in models
}
-
+
completed = 0
total = len(models)
@@ -589,7 +594,7 @@ def main(
"task_types": js["task"]["task_type"],
"task_criteria": js["task"]["task_criteria"],
"golden_answer": js.get("golden_answer", ""),
- "configured_output_tokens_for_request": js.get("expected_output_tokens", 5000),
+ "configured_output_tokens_for_request": js.get("expected_output_tokens", 4500),
"region": js.get("region", "us-east-1"),
"temperature": js.get("temperature", 0.7),
"user_defined_metrics": js.get("user_defined_metrics", ""),
diff --git a/migrations/360-eval/src/utils.py b/migrations/360-eval/src/utils.py
index 9bee7805d..2bac37300 100644
--- a/migrations/360-eval/src/utils.py
+++ b/migrations/360-eval/src/utils.py
@@ -7,6 +7,7 @@
import random
import logging
import base64
+import litellm
import requests
import requests.exceptions
from tenacity import retry, stop_after_delay, wait_exponential, retry_if_exception_type
@@ -19,7 +20,7 @@
logger = logging.getLogger(__name__)
-
+litellm.drop_params = True
# ----------------------------------------
# Request Builders
diff --git a/migrations/360-eval/src/visualize_results.py b/migrations/360-eval/src/visualize_results.py
index b4d48d530..893d72cc5 100644
--- a/migrations/360-eval/src/visualize_results.py
+++ b/migrations/360-eval/src/visualize_results.py
@@ -38,7 +38,7 @@
# Performance thresholds
PERFORMANCE_THRESHOLDS = {
'success_rate': {'good': 0.95, 'medium': 0.85},
- 'avg_latency': {'good': 0.6, 'medium': 1.5},
+ # 'avg_latency': {'good': 1.5, 'medium': 2},
'avg_cost': {'good': 0.5, 'medium': 1.0},
'avg_otps': {'good': 100, 'medium': 35},
}
@@ -1108,6 +1108,37 @@ def extract_judge_scores(json_str):
return {}
+from collections import defaultdict
+import numpy as np
+def build_task_latency_thresholds(records, method="percentile", value=0.75, round_ndigits=3):
+ """
+ Build latency thresholds per task across models.
+ Parameters
+ ----------
+ """
+ by_task = defaultdict(list)
+ # group latencies by task
+ for r in records:
+ tt = r.get("task_types")
+ lat = r.get("avg_latency")
+ if tt and isinstance(lat, (int, float)) and lat > 0:
+ by_task[tt].append(float(lat))
+ out = {}
+ for tt, lats in by_task.items():
+ arr = np.array(lats, dtype=float)
+ med = float(np.median(arr))
+ if method == "percentile":
+ medium_cutoff = float(np.quantile(arr, value))
+ elif method == "tolerance":
+ medium_cutoff = med * (1 + value)
+ else:
+ raise ValueError("method must be 'percentile' or 'tolerance'")
+ out[tt] = {
+ "good": round(med, round_ndigits),
+ "medium": round(medium_cutoff, round_ndigits)
+ }
+ return out
+
##############################
##############################
@@ -1125,149 +1156,95 @@ def create_integrated_analysis_table(model_task_metrics):
'below': '#ffd4a3', # Orange - within 30% of best
'poor': '#ffcccc' # Light red - more than 30% behind
}
-
- # Group by task and create separate tables
- task_tables = {}
-
- for task in model_task_metrics['task_types'].unique():
- task_data = model_task_metrics[model_task_metrics['task_types'] == task].copy()
-
- if task_data.empty:
- continue
-
- # Calculate average token size for this task
- avg_tokens = task_data['avg_input_tokens'].mean() #task_data['avg_output_tokens'].mean() +
-
- # Get best values for each metric in this task
- # best_success_rate = task_data['success_rate'].max()
- # best_latency = task_data['avg_latency'].min() # Lower is better
- # best_cost = task_data['avg_cost'].min() # Lower is better
- # best_otps = task_data['avg_otps'].max() # Higher is better
-
- # Format metrics for display
- task_data['success_rate_fmt'] = task_data['success_rate'].apply(lambda x: f"{x:.1%}")
- task_data['avg_latency_fmt'] = task_data['avg_latency'].apply(lambda x: f"{x:.2f}s")
- task_data['avg_cost_fmt'] = task_data['avg_cost'].apply(lambda x: f"${x:.4f}")
- task_data['avg_otps_fmt'] = task_data['avg_otps'].apply(lambda x: f"{x:.1f}")
- # task_data['total_tokens_fmt'] = (task_data['avg_input_tokens'] + task_data['avg_output_tokens']).apply(lambda x: f"{x:.0f}")
- task_data['total_tokens_fmt'] = task_data['avg_output_tokens'].apply(lambda x: f"{x:.0f}")
-
- # Calculate composite score
- max_latency = task_data['avg_latency'].max() or 1
- max_cost = task_data['avg_cost'].max() or 1
-
- task_data['composite_score'] = (
- task_data['success_rate'] +
- (1 - (task_data['avg_latency'] / max_latency)) * COMPOSITE_SCORE_WEIGHTS['latency'] +
- (1 - (task_data['avg_cost'] / max_cost)) * COMPOSITE_SCORE_WEIGHTS['cost']
- )
-
- # Sort by composite score descending
- task_data = task_data.sort_values('composite_score', ascending=False)
-
- # Get best composite score for coloring
- # best_composite = task_data['composite_score'].max()
-
- # Helper function to get color based on distance from best
- def get_distance_based_color(value, metric, best_value):
- """
- Colors based on percentage distance from the best performer.
- For 'higher is better' metrics: distance = (best - value) / best
- For 'lower is better' metrics: distance = (value - best) / best
- """
-
- # Calculate percentage difference from best
- if metric in ['success_rate', 'avg_otps']: # Higher is better
- if best_value == 0: # Avoid division by zero
- distance_pct = 100
- else:
- distance_pct = ((best_value - value) / best_value) * 100
-
- # Special case: if value equals best, it's the best
- if value == best_value:
- return colors['best']
-
- else: # Lower is better (latency, cost)
- if best_value == 0: # Avoid division by zero
- distance_pct = 100 if value > 0 else 0
- else:
- distance_pct = ((value - best_value) / best_value) * 100
-
- # Special case: if value equals best, it's the best
- if value == best_value:
- return colors['best']
-
- # Assign color based on distance from best
- if distance_pct <= 5:
- return colors['excellent'] # Within 5% of best
- elif distance_pct <= 10:
- return colors['good'] # Within 10% of best
- elif distance_pct <= 20:
- return colors['medium'] # Within 20% of best
- elif distance_pct <= 30:
- return colors['below'] # Within 30% of best
+
+ # Prepare the data for the table
+ table_data = model_task_metrics.copy()
+
+ thresholds['avg_latency'] = build_task_latency_thresholds(table_data[['model_name', 'task_types', 'avg_latency']].to_dict(orient='records'))
+ # ['avg_output_tokens'].median()
+ # Format Model Name
+ table_data['model_name'] = table_data['model_name'].apply(lambda x: x.split('/')[-1])
+
+ # Format metrics for display
+ table_data['success_rate_fmt'] = table_data['success_rate'].apply(lambda x: f"{x:.1%}")
+ table_data['avg_latency_fmt'] = table_data['avg_latency'].apply(lambda x: f"{x:.2f}s")
+ table_data['avg_cost_fmt'] = table_data['avg_cost'].apply(lambda x: f"${x:.4f}")
+ table_data['avg_otps_fmt'] = table_data['avg_otps'].apply(lambda x: f"{x:.1f}")
+
+ # Calculate composite score (higher is better)
+ # Normalize metrics to 0-1 range and combine them
+ max_latency = table_data['avg_latency'].max() or 1
+ max_cost = table_data['avg_cost'].max() or 1
+
+ table_data['composite_score'] = (
+ table_data['success_rate'] +
+ (1 - (table_data['avg_latency'] / max_latency)) * COMPOSITE_SCORE_WEIGHTS['latency'] +
+ (1 - (table_data['avg_cost'] / max_cost)) * COMPOSITE_SCORE_WEIGHTS['cost']
+ )
+
+ # Create figure
+ fig = go.Figure()
+
+ # Helper function to determine color based on value and thresholds
+ def get_color(value, metric):
+ if metric == 'success_rate' or metric == 'avg_otps':
+ if value >= thresholds[metric]['good']:
+ return colors['good']
+ elif value >= thresholds[metric]['medium']:
+ return colors['medium']
else:
- return colors['poor'] # More than 30% behind
-
- # Create figure for this task
- fig = go.Figure()
-
- # Create table cells with conditional formatting
- fig.add_trace(go.Table(
- header=dict(
- values=['Model', 'Success Rate', 'Latency', 'Cost', 'Tokens/sec', 'Avg Output Tokens', 'Score'],
- font=dict(size=12, color='white'),
- fill_color='#2E5A88',
- align='left'
- ),
- cells=dict(
- values=[
- task_data['model_name'].tolist(),
- task_data['success_rate_fmt'].tolist(),
- task_data['avg_latency_fmt'].tolist(),
- task_data['avg_cost_fmt'].tolist(),
- task_data['avg_otps_fmt'].tolist(),
- task_data['total_tokens_fmt'].tolist(),
- task_data['composite_score'].apply(lambda x: f"{x:.2f}").tolist()
- ],
- align='left',
- font=dict(size=11, color='#333333', family='Arial, sans-serif'), # Explicit dark font
- # Conditional formatting based on distance from best performer
- fill_color=[
- ['white'] * len(task_data), # Model column (no coloring)
- # Success rate coloring (higher is better)
- [get_distance_based_color(sr, 'success_rate', task_data['success_rate'].max()) for sr in task_data['success_rate']],
- # Latency coloring (lower is better)
- [get_distance_based_color(lt, 'avg_latency', task_data['avg_latency'].min()) for lt in task_data['avg_latency']],
- # Cost coloring (lower is better)
- [get_distance_based_color(cost, 'avg_cost', task_data['avg_cost'].min()) for cost in task_data['avg_cost']],
- # OTPS coloring (higher is better)
- [get_distance_based_color(tps, 'avg_otps', task_data['avg_otps'].max()) for tps in task_data['avg_otps']],
- ['#f0f0f0'] * len(task_data), # Avg tokens column (light gray)
- # Composite score coloring based on distance from best
- [get_distance_based_color(score, 'composite_score', task_data['composite_score'].max()) for score in task_data['composite_score']]
- ]
- )
- ))
-
- # Calculate precise height based on content
- header_height = 45 # Height for table header
- row_height = 30 # Height per data row
- title_height = 10 # Space for title and subtitle
- margin_height = 30 # Top and bottom margins
-
- total_height = header_height + (len(task_data) * row_height) + title_height + margin_height
-
- # Update layout with title showing token context
- fig.update_layout(
-        title=f'Performance Analysis: {task}<br>Average Input tokens: {avg_tokens:.0f}',
- title_font=dict(size=16, color='#333333'),
- width=1100,
- height=total_height, # Precise height calculation
- margin=dict(l=20, r=20, b=20, t=60),
- template="plotly_white",
- paper_bgcolor="#ffffff",
- font=dict(color='#333333') # Ensure text is dark on white background
+ return colors['poor']
+ elif metric == 'avg_latency':
+ if value['avg_latency'] <= thresholds[metric][value['task_types']]['good']:
+ return colors['good']
+ else:
+ return colors['medium']
+ else: # For latency and cost, lower is better
+ if value <= thresholds[metric]['good']:
+ return colors['good']
+ elif value <= thresholds[metric]['medium']:
+ return colors['medium']
+ else:
+ return colors['poor']
+
+ # Create table cells with conditional formatting
+ fig.add_trace(go.Table(
+ header=dict(
+ values=['Model', 'Task Type', 'Success Rate', 'Latency', 'Cost', 'Tokens/sec', 'Score'],
+ font=dict(size=12, color='white'),
+ fill_color='#2E5A88',
+ align='left'
+ ),
+ cells=dict(
+ values=[
+ table_data['model_name'],
+ table_data['task_types'],
+ table_data['success_rate_fmt'],
+ table_data['avg_latency_fmt'],
+ table_data['avg_cost_fmt'],
+ table_data['avg_otps_fmt'],
+ table_data['composite_score'].apply(lambda x: f"{x:.2f}")
+ ],
+ align='left',
+ font=dict(size=11),
+ # Conditional formatting based on thresholds
+ fill_color=[
+ ['white'] * len(table_data), # Model column (no coloring)
+ ['white'] * len(table_data), # Task column (no coloring)
+ # Success rate coloring (three-color)
+ [get_color(sr, 'success_rate') for sr in table_data['success_rate']],
+ # Latency coloring (three-color)
+ [get_color(lt, 'avg_latency') for lt in table_data[['avg_latency','task_types']].to_dict(orient='records')],
+ # Cost coloring (three-color)
+ [get_color(cost, 'avg_cost') for cost in table_data['avg_cost']],
+ # OTPS coloring (just use white)
+ # ['white'] * len(table_data),
+ [get_color(tps, 'avg_otps') for tps in table_data['avg_otps']],
+ # Composite score coloring based on quantiles
+ [colors['good'] if score >= table_data['composite_score'].quantile(0.67) else
+ colors['medium'] if score >= table_data['composite_score'].quantile(0.33) else
+ colors['poor'] for score in table_data['composite_score']]
+ ]
)
# Store the table for this task