@@ -20,8 +20,9 @@ def get_datapoints() -> List[Datapoint]:
2020 name = "TestSimpleAddition" ,
2121 input = {"expression" : "how much is 2 + 5" },
2222 evaluation_criteria = {
23- "ExactMatchEvaluator" : {"expected_output" : {"answer" : 7 }},
23+ "ExactMatchEvaluator" : {"expected_output" : {"answer" : 7.0 }},
2424 "ContainsEvaluator" : {"search_text" : "7" },
25+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 7.0 }},
2526 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 0 )}},
2627 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" ]},
2728 "ToolCallArgsEvaluator" : {"tool_calls" : [{"name" : "add" , "args" : {"a" : 2 , "b" : 5 }}]},
@@ -34,6 +35,7 @@ def get_datapoints() -> List[Datapoint]:
3435 evaluation_criteria = {
3536 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 10.0 }},
3637 "ContainsEvaluator" : {"search_text" : "10" },
38+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 10.0 }},
3739 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 0 ), "multiply" : ("=" , 1 )}},
3840 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" ]},
3941 "ToolCallArgsEvaluator" : {"tool_calls" : [{"name" : "multiply" , "args" : {"a" : 2 , "b" : 5 }}]},
@@ -45,6 +47,7 @@ def get_datapoints() -> List[Datapoint]:
4547 input = {"expression" : "how much is 5 - 2" },
4648 evaluation_criteria = {
4749 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 3.0 }},
50+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 3.0 }},
4851 "ContainsEvaluator" : {"search_text" : "3" },
4952 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 0 )}},
5053 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" ]},
@@ -58,6 +61,7 @@ def get_datapoints() -> List[Datapoint]:
5861 evaluation_criteria = {
5962 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 12.0 }},
6063 "ContainsEvaluator" : {"search_text" : "12" },
64+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 12.0 }},
6165 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
6266 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "add" ]},
6367 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -73,6 +77,7 @@ def get_datapoints() -> List[Datapoint]:
7377 evaluation_criteria = {
7478 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 25.0 }},
7579 "ContainsEvaluator" : {"search_text" : "25" },
80+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 25.0 }},
7681 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 2 )}},
7782 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "multiply" , "add" ]},
7883 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -89,6 +94,7 @@ def get_datapoints() -> List[Datapoint]:
8994 evaluation_criteria = {
9095 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 150.0 }},
9196 "ContainsEvaluator" : {"search_text" : "150" },
97+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 150.0 }},
9298 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 2 )}},
9399 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" , "multiply" , "multiply" ]},
94100 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -105,6 +111,7 @@ def get_datapoints() -> List[Datapoint]:
105111 evaluation_criteria = {
106112 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 14.0 }},
107113 "ContainsEvaluator" : {"search_text" : "14" },
114+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 14.0 }},
108115 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
109116 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "add" ]},
110117 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -120,6 +127,7 @@ def get_datapoints() -> List[Datapoint]:
120127 evaluation_criteria = {
121128 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 25.0 }},
122129 "ContainsEvaluator" : {"search_text" : "25" },
130+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 25.0 }},
123131 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 2 ), "multiply" : ("=" , 1 )}},
124132 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" , "add" , "multiply" ]},
125133 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -136,6 +144,7 @@ def get_datapoints() -> List[Datapoint]:
136144 evaluation_criteria = {
137145 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 8.0 }},
138146 "ContainsEvaluator" : {"search_text" : "8" },
147+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 8.0 }},
139148 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
140149 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "add" ]},
141150 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -151,6 +160,7 @@ def get_datapoints() -> List[Datapoint]:
151160 evaluation_criteria = {
152161 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 15.0 }},
153162 "ContainsEvaluator" : {"search_text" : "15" },
163+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 15.0 }},
154164 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
155165 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "add" ]},
156166 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -166,6 +176,7 @@ def get_datapoints() -> List[Datapoint]:
166176 evaluation_criteria = {
167177 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 20.0 }},
168178 "ContainsEvaluator" : {"search_text" : "20" },
179+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 20.0 }},
169180 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 2 ), "multiply" : ("=" , 1 )}},
170181 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" , "add" , "multiply" ]},
171182 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -182,6 +193,7 @@ def get_datapoints() -> List[Datapoint]:
182193 evaluation_criteria = {
183194 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 28.0 }},
184195 "ContainsEvaluator" : {"search_text" : "28" },
196+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 28.0 }},
185197 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 2 ), "multiply" : ("=" , 2 )}},
186198 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" , "multiply" , "add" , "multiply" ]},
187199 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -199,6 +211,7 @@ def get_datapoints() -> List[Datapoint]:
199211 evaluation_criteria = {
200212 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 1.0 }},
201213 "ContainsEvaluator" : {"search_text" : "1" },
214+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 1.0 }},
202215 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
203216 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["multiply" , "add" ]},
204217 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -214,6 +227,7 @@ def get_datapoints() -> List[Datapoint]:
214227 evaluation_criteria = {
215228 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 5.0 }},
216229 "ContainsEvaluator" : {"search_text" : "5" },
230+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 5.0 }},
217231 "ToolCallCountEvaluator" : {"tool_calls_count" : {"add" : ("=" , 1 ), "multiply" : ("=" , 1 )}},
218232 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["add" , "multiply" ]},
219233 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -229,6 +243,7 @@ def get_datapoints() -> List[Datapoint]:
229243 evaluation_criteria = {
230244 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 22.0 }},
231245 "ContainsEvaluator" : {"search_text" : "22" },
246+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 22.0 }},
232247 "ToolCallCountEvaluator" : {"tool_calls_count" : {"escalation" : ("=" , 1 ), "add" : ("=" , 1 )}},
233248 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["escalation" , "add" ]},
234249 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -243,6 +258,7 @@ def get_datapoints() -> List[Datapoint]:
243258 evaluation_criteria = {
244259 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 102.0 }},
245260 "ContainsEvaluator" : {"search_text" : "102" },
261+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 102.0 }},
246262 "ToolCallCountEvaluator" : {"tool_calls_count" : {"escalation" : ("=" , 2 ), "multiply" : ("=" , 1 ), "add" : ("=" , 1 )}},
247263 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["escalation" , "escalation" , "multiply" , "add" ]},
248264 "ToolCallArgsEvaluator" : {"tool_calls" : [
@@ -258,6 +274,7 @@ def get_datapoints() -> List[Datapoint]:
258274 evaluation_criteria = {
259275 "ExactMatchEvaluator" : {"expected_output" : {"answer" : 311.0 }},
260276 "ContainsEvaluator" : {"search_text" : "311" },
277+ "JsonSimilarityEvaluator" : {"expected_output" : {"answer" : 311.0 }},
261278 "ToolCallCountEvaluator" : {"tool_calls_count" : {"escalation" : ("=" , 3 ), "multiply" : ("=" , 1 ), "add" : ("=" , 2 )}},
262279 "ToolCallOrderEvaluator" : {"tool_calls_order" : ["escalation" , "escalation" , "escalation" , "multiply" , "add" , "add" ]},
263280 "ToolCallArgsEvaluator" : {"tool_calls" : [
0 commit comments