Skip to content

Commit e224f5c

Browse files
committed
feat(samples): add next batch of supported evaluators and extra functionality to the export evals script
1 parent 9d68674 commit e224f5c

File tree

3 files changed

+65
-14
lines changed

3 files changed

+65
-14
lines changed

samples/gym-sample/export_evals.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
# Usage:
44
# ./export_evals.sh # Export only supported evaluators (LLM judges included by default)
5-
# ./export_evals.sh --only-supported # Export only supported evaluators
5+
# ./export_evals.sh --include_not_supported # Include not supported evaluators
6+
# ./export_evals.sh --exclude_llm_judge # Exclude LLM judge evaluators
7+
# ./export_evals.sh --small_set_size 10 # Export a small set of 10 datapoints
68

79
# Export evaluators and eval sets to UiPath eval format
810
uv run python -m gym_sample.export_evals "$@"

samples/gym-sample/src/gym_sample/calculator/agent.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ def get_datapoints() -> List[Datapoint]:
2020
name="TestSimpleAddition",
2121
input={"expression": "how much is 2 + 5"},
2222
evaluation_criteria={
23-
"ExactMatchEvaluator": {"expected_output": {"answer": 7}},
23+
"ExactMatchEvaluator": {"expected_output": {"answer": 7.0}},
2424
"ContainsEvaluator": {"search_text": "7"},
25+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 7.0}},
2526
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 0)}},
2627
"ToolCallOrderEvaluator": {"tool_calls_order": ["add"]},
2728
"ToolCallArgsEvaluator": {"tool_calls": [{"name": "add", "args": {"a": 2, "b": 5}}]},
@@ -34,6 +35,7 @@ def get_datapoints() -> List[Datapoint]:
3435
evaluation_criteria={
3536
"ExactMatchEvaluator": {"expected_output": {"answer": 10.0}},
3637
"ContainsEvaluator": {"search_text": "10"},
38+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 10.0}},
3739
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 0), "multiply": ("=", 1)}},
3840
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply"]},
3941
"ToolCallArgsEvaluator": {"tool_calls": [{"name": "multiply", "args": {"a": 2, "b": 5}}]},
@@ -45,6 +47,7 @@ def get_datapoints() -> List[Datapoint]:
4547
input={"expression": "how much is 5 - 2"},
4648
evaluation_criteria={
4749
"ExactMatchEvaluator": {"expected_output": {"answer": 3.0}},
50+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 3.0}},
4851
"ContainsEvaluator": {"search_text": "3"},
4952
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 0)}},
5053
"ToolCallOrderEvaluator": {"tool_calls_order": ["add"]},
@@ -58,6 +61,7 @@ def get_datapoints() -> List[Datapoint]:
5861
evaluation_criteria={
5962
"ExactMatchEvaluator": {"expected_output": {"answer": 12.0}},
6063
"ContainsEvaluator": {"search_text": "12"},
64+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 12.0}},
6165
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
6266
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "add"]},
6367
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -73,6 +77,7 @@ def get_datapoints() -> List[Datapoint]:
7377
evaluation_criteria={
7478
"ExactMatchEvaluator": {"expected_output": {"answer": 25.0}},
7579
"ContainsEvaluator": {"search_text": "25"},
80+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 25.0}},
7681
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 2)}},
7782
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "multiply", "add"]},
7883
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -89,6 +94,7 @@ def get_datapoints() -> List[Datapoint]:
8994
evaluation_criteria={
9095
"ExactMatchEvaluator": {"expected_output": {"answer": 150.0}},
9196
"ContainsEvaluator": {"search_text": "150"},
97+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 150.0}},
9298
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 2)}},
9399
"ToolCallOrderEvaluator": {"tool_calls_order": ["add", "multiply", "multiply"]},
94100
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -105,6 +111,7 @@ def get_datapoints() -> List[Datapoint]:
105111
evaluation_criteria={
106112
"ExactMatchEvaluator": {"expected_output": {"answer": 14.0}},
107113
"ContainsEvaluator": {"search_text": "14"},
114+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 14.0}},
108115
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
109116
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "add"]},
110117
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -120,6 +127,7 @@ def get_datapoints() -> List[Datapoint]:
120127
evaluation_criteria={
121128
"ExactMatchEvaluator": {"expected_output": {"answer": 25.0}},
122129
"ContainsEvaluator": {"search_text": "25"},
130+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 25.0}},
123131
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 2), "multiply": ("=", 1)}},
124132
"ToolCallOrderEvaluator": {"tool_calls_order": ["add", "add", "multiply"]},
125133
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -136,6 +144,7 @@ def get_datapoints() -> List[Datapoint]:
136144
evaluation_criteria={
137145
"ExactMatchEvaluator": {"expected_output": {"answer": 8.0}},
138146
"ContainsEvaluator": {"search_text": "8"},
147+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 8.0}},
139148
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
140149
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "add"]},
141150
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -151,6 +160,7 @@ def get_datapoints() -> List[Datapoint]:
151160
evaluation_criteria={
152161
"ExactMatchEvaluator": {"expected_output": {"answer": 15.0}},
153162
"ContainsEvaluator": {"search_text": "15"},
163+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 15.0}},
154164
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
155165
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "add"]},
156166
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -166,6 +176,7 @@ def get_datapoints() -> List[Datapoint]:
166176
evaluation_criteria={
167177
"ExactMatchEvaluator": {"expected_output": {"answer": 20.0}},
168178
"ContainsEvaluator": {"search_text": "20"},
179+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 20.0}},
169180
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 2), "multiply": ("=", 1)}},
170181
"ToolCallOrderEvaluator": {"tool_calls_order": ["add", "add", "multiply"]},
171182
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -182,6 +193,7 @@ def get_datapoints() -> List[Datapoint]:
182193
evaluation_criteria={
183194
"ExactMatchEvaluator": {"expected_output": {"answer": 28.0}},
184195
"ContainsEvaluator": {"search_text": "28"},
196+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 28.0}},
185197
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 2), "multiply": ("=", 2)}},
186198
"ToolCallOrderEvaluator": {"tool_calls_order": ["add", "multiply", "add", "multiply"]},
187199
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -199,6 +211,7 @@ def get_datapoints() -> List[Datapoint]:
199211
evaluation_criteria={
200212
"ExactMatchEvaluator": {"expected_output": {"answer": 1.0}},
201213
"ContainsEvaluator": {"search_text": "1"},
214+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 1.0}},
202215
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
203216
"ToolCallOrderEvaluator": {"tool_calls_order": ["multiply", "add"]},
204217
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -214,6 +227,7 @@ def get_datapoints() -> List[Datapoint]:
214227
evaluation_criteria={
215228
"ExactMatchEvaluator": {"expected_output": {"answer": 5.0}},
216229
"ContainsEvaluator": {"search_text": "5"},
230+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 5.0}},
217231
"ToolCallCountEvaluator": {"tool_calls_count": {"add": ("=", 1), "multiply": ("=", 1)}},
218232
"ToolCallOrderEvaluator": {"tool_calls_order": ["add", "multiply"]},
219233
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -229,6 +243,7 @@ def get_datapoints() -> List[Datapoint]:
229243
evaluation_criteria={
230244
"ExactMatchEvaluator": {"expected_output": {"answer": 22.0}},
231245
"ContainsEvaluator": {"search_text": "22"},
246+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 22.0}},
232247
"ToolCallCountEvaluator": {"tool_calls_count": {"escalation": ("=", 1), "add": ("=", 1)}},
233248
"ToolCallOrderEvaluator": {"tool_calls_order": ["escalation", "add"]},
234249
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -243,6 +258,7 @@ def get_datapoints() -> List[Datapoint]:
243258
evaluation_criteria={
244259
"ExactMatchEvaluator": {"expected_output": {"answer": 102.0}},
245260
"ContainsEvaluator": {"search_text": "102"},
261+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 102.0}},
246262
"ToolCallCountEvaluator": {"tool_calls_count": {"escalation": ("=", 2), "multiply": ("=", 1), "add": ("=", 1)}},
247263
"ToolCallOrderEvaluator": {"tool_calls_order": ["escalation", "escalation", "multiply", "add"]},
248264
"ToolCallArgsEvaluator": {"tool_calls": [
@@ -258,6 +274,7 @@ def get_datapoints() -> List[Datapoint]:
258274
evaluation_criteria={
259275
"ExactMatchEvaluator": {"expected_output": {"answer": 311.0}},
260276
"ContainsEvaluator": {"search_text": "311"},
277+
"JsonSimilarityEvaluator": {"expected_output": {"answer": 311.0}},
261278
"ToolCallCountEvaluator": {"tool_calls_count": {"escalation": ("=", 3), "multiply": ("=", 1), "add": ("=", 2)}},
262279
"ToolCallOrderEvaluator": {"tool_calls_order": ["escalation", "escalation", "escalation", "multiply", "add", "add"]},
263280
"ToolCallArgsEvaluator": {"tool_calls": [

samples/gym-sample/src/gym_sample/export_evals.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import uuid
1111
from pathlib import Path
1212
from typing import Any, Dict, List
13+
from dotenv import load_dotenv, find_dotenv
1314

1415
from gym_sample.graph import get_agents, get_all_evaluators
1516

@@ -94,7 +95,7 @@ def datapoint_to_evaluation(
9495
}
9596

9697

97-
def export_evaluators(agent_name: str, output_dir: Path, only_supported: bool = False) -> List[str]:
98+
def export_evaluators(agent_name: str, output_dir: Path, only_supported: bool = False, include_llm_judge: bool = False) -> List[str]:
9899
"""Export evaluator specs for an agent.
99100
100101
Args:
@@ -107,11 +108,15 @@ def export_evaluators(agent_name: str, output_dir: Path, only_supported: bool =
107108
"""
108109
# Currently supported evaluators in PR #685
109110
SUPPORTED_EVALUATORS = {
110-
"ContainsEvaluator"
111+
"ContainsEvaluator",
112+
"ExactMatchEvaluator",
113+
"JsonSimilarityEvaluator",
114+
"LLMJudgeOutputEvaluator",
115+
"LLMJudgeStrictJSONSimilarityOutputEvaluator",
111116
}
112117

113118
evaluators_getter = get_all_evaluators()[agent_name]
114-
evaluators = evaluators_getter(False) # Export without LLM judges by default
119+
evaluators = evaluators_getter(include_llm_judge)
115120

116121
output_dir.mkdir(parents=True, exist_ok=True)
117122
evaluator_ids = []
@@ -138,7 +143,8 @@ def export_evaluators(agent_name: str, output_dir: Path, only_supported: bool =
138143
def export_eval_set(
139144
agent_name: str,
140145
evaluator_refs: List[str],
141-
output_dir: Path
146+
output_dir: Path,
147+
small_set_size: int = 0
142148
) -> None:
143149
"""Export an agent's datapoints as a UiPath eval_set JSON file (version 1.0).
144150
@@ -171,10 +177,20 @@ def export_eval_set(
171177
with open(output_path, 'w') as f:
172178
json.dump(eval_set, f)
173179

180+
print(small_set_size)
181+
if small_set_size > 0:
182+
eval_set["evaluations"] = [
183+
datapoint_to_evaluation(dp, eval_set_id, evaluator_refs, agent_name)
184+
for dp in agent.datapoints[:small_set_size]
185+
]
186+
output_path = output_dir / f"evaluation-set-{agent_name}-small.json"
187+
with open(output_path, 'w') as f:
188+
json.dump(eval_set, f)
189+
174190
print(f" ✅ Exported eval set with {len(agent.datapoints)} evaluations")
175191

176192

177-
def export_agent(agent_name: str, base_dir: Path, only_supported: bool = False) -> None:
193+
def export_agent(agent_name: str, base_dir: Path, only_supported: bool = False, include_llm_judge: bool = False, small_set_size: int = 0) -> None:
178194
"""Export all evaluators and eval sets for a single agent.
179195
180196
Args:
@@ -186,11 +202,11 @@ def export_agent(agent_name: str, base_dir: Path, only_supported: bool = False)
186202

187203
# Export evaluators
188204
evaluators_dir = base_dir / "evaluators"
189-
evaluator_ids = export_evaluators(agent_name, evaluators_dir, only_supported)
205+
evaluator_ids = export_evaluators(agent_name, evaluators_dir, only_supported, include_llm_judge)
190206

191207
# Export eval set
192208
eval_sets_dir = base_dir / "eval-sets"
193-
export_eval_set(agent_name, evaluator_ids, eval_sets_dir)
209+
export_eval_set(agent_name, evaluator_ids, eval_sets_dir, small_set_size)
194210

195211
print(f"✨ Completed {agent_name} agent export\n")
196212

@@ -199,21 +215,34 @@ def main() -> None:
199215
"""Export all agent evaluators and eval sets."""
200216
parser = argparse.ArgumentParser(description="Export evaluators and eval sets for agents")
201217
parser.add_argument(
202-
"--only-supported",
218+
"--include_not_supported",
203219
action="store_true",
204-
help="Only export evaluators supported by the current PR (currently: ContainsEvaluator)"
220+
help="Include evaluators not supported by the current PR (currently: ContainsEvaluator)"
221+
)
222+
parser.add_argument(
223+
"--exclude_llm_judge",
224+
action="store_true",
225+
help="Include LLM judge evaluators"
226+
)
227+
parser.add_argument(
228+
"--small_set_size",
229+
type=int,
230+
default=0,
231+
help="Size of the small eval set to export"
205232
)
206233
args, _ = parser.parse_known_args()
207234

235+
load_dotenv(find_dotenv())
236+
208237
# Export to the standard location that uipath eval discovers
209238
base_dir = Path(__file__).parent.parent.parent / "evals"
210239

211240
print("🚀 Starting export of evaluators and eval sets...")
212-
if args.only_supported:
213-
print(" (Only exporting supported evaluators)")
241+
if args.include_not_supported:
242+
print(" (Including not supported evaluators)")
214243

215244
for agent_name in ["calculator", "loan"]:
216-
export_agent(agent_name, base_dir, only_supported=args.only_supported)
245+
export_agent(agent_name, base_dir, only_supported=not args.include_not_supported, include_llm_judge=not args.exclude_llm_judge, small_set_size=args.small_set_size)
217246

218247
print("✅ All exports completed!")
219248
print(f"\n📁 Files exported to: {base_dir.absolute()}")
@@ -223,6 +252,9 @@ def main() -> None:
223252
print(" └── eval-sets/")
224253
print(" ├── evaluation-set-calculator.json")
225254
print(" └── evaluation-set-loan.json")
255+
if args.small_set_size > 0:
256+
print(f" └── evaluation-set-calculator-small.json")
257+
print(f" └── evaluation-set-loan-small.json")
226258

227259

228260
if __name__ == "__main__":

0 commit comments

Comments
 (0)