@@ -141,7 +141,7 @@ export interface EvaluationRetrieveResponse {
141
141
/**
142
142
* The parameters used for this evaluation
143
143
*/
144
- parameters ?: unknown ;
144
+ parameters ?: { [ key : string ] : unknown } ;
145
145
146
146
/**
147
147
* Results of the evaluation (when completed)
@@ -213,10 +213,7 @@ export namespace EvaluationRetrieveResponse {
213
213
}
214
214
215
215
export interface EvaluationScoreResults {
216
- /**
217
- * Aggregated score statistics
218
- */
219
- aggregated_scores ?: unknown ;
216
+ aggregated_scores ?: EvaluationScoreResults . AggregatedScores ;
220
217
221
218
/**
222
219
* number of failed samples generated from model
@@ -244,6 +241,16 @@ export namespace EvaluationRetrieveResponse {
244
241
result_file_id ?: string ;
245
242
}
246
243
244
+ export namespace EvaluationScoreResults {
245
+ export interface AggregatedScores {
246
+ mean_score ?: number ;
247
+
248
+ pass_percentage ?: number ;
249
+
250
+ std_score ?: number ;
251
+ }
252
+ }
253
+
247
254
export interface EvaluationCompareResults {
248
255
/**
249
256
* Number of times model A won
@@ -348,10 +355,7 @@ export namespace EvaluationGetStatusResponse {
348
355
}
349
356
350
357
export interface EvaluationScoreResults {
351
- /**
352
- * Aggregated score statistics
353
- */
354
- aggregated_scores ?: unknown ;
358
+ aggregated_scores ?: EvaluationScoreResults . AggregatedScores ;
355
359
356
360
/**
357
361
* number of failed samples generated from model
@@ -379,6 +383,16 @@ export namespace EvaluationGetStatusResponse {
379
383
result_file_id ?: string ;
380
384
}
381
385
386
+ export namespace EvaluationScoreResults {
387
+ export interface AggregatedScores {
388
+ mean_score ?: number ;
389
+
390
+ pass_percentage ?: number ;
391
+
392
+ std_score ?: number ;
393
+ }
394
+ }
395
+
382
396
export interface EvaluationCompareResults {
383
397
/**
384
398
* Number of times model A won
@@ -527,10 +541,120 @@ export interface EvaluationUpdateStatusParams {
527
541
*/
528
542
error ?: string ;
529
543
530
- /**
531
- * Job results (required when status is 'completed')
532
- */
533
- results ?: unknown ;
544
+ results ?:
545
+ | EvaluationUpdateStatusParams . EvaluationClassifyResults
546
+ | EvaluationUpdateStatusParams . EvaluationScoreResults
547
+ | EvaluationUpdateStatusParams . EvaluationCompareResults ;
548
+ }
549
+
550
+ export namespace EvaluationUpdateStatusParams {
551
+ export interface EvaluationClassifyResults {
552
+ /**
553
+ * Number of failed generations.
554
+ */
555
+ generation_fail_count ?: number | null ;
556
+
557
+ /**
558
+ * Number of invalid labels
559
+ */
560
+ invalid_label_count ?: number | null ;
561
+
562
+ /**
563
+ * Number of failed judge generations
564
+ */
565
+ judge_fail_count ?: number | null ;
566
+
567
+ /**
568
+ * JSON string representing label counts
569
+ */
570
+ label_counts ?: string ;
571
+
572
+ /**
573
+ * Percentage of pass labels.
574
+ */
575
+ pass_percentage ?: number | null ;
576
+
577
+ /**
578
+ * Data File ID
579
+ */
580
+ result_file_id ?: string ;
581
+ }
582
+
583
+ export interface EvaluationScoreResults {
584
+ aggregated_scores ?: EvaluationScoreResults . AggregatedScores ;
585
+
586
+ /**
587
+ * number of failed samples generated from model
588
+ */
589
+ failed_samples ?: number ;
590
+
591
+ /**
592
+ * Number of failed generations.
593
+ */
594
+ generation_fail_count ?: number | null ;
595
+
596
+ /**
597
+ * number of invalid scores generated from model
598
+ */
599
+ invalid_score_count ?: number ;
600
+
601
+ /**
602
+ * Number of failed judge generations
603
+ */
604
+ judge_fail_count ?: number | null ;
605
+
606
+ /**
607
+ * Data File ID
608
+ */
609
+ result_file_id ?: string ;
610
+ }
611
+
612
+ export namespace EvaluationScoreResults {
613
+ export interface AggregatedScores {
614
+ mean_score ?: number ;
615
+
616
+ pass_percentage ?: number ;
617
+
618
+ std_score ?: number ;
619
+ }
620
+ }
621
+
622
+ export interface EvaluationCompareResults {
623
+ /**
624
+ * Number of times model A won
625
+ */
626
+ A_wins ?: number ;
627
+
628
+ /**
629
+ * Number of times model B won
630
+ */
631
+ B_wins ?: number ;
632
+
633
+ /**
634
+ * Number of failed generations.
635
+ */
636
+ generation_fail_count ?: number | null ;
637
+
638
+ /**
639
+ * Number of failed judge generations
640
+ */
641
+ judge_fail_count ?: number | null ;
642
+
643
+ /**
644
+ * Total number of samples compared
645
+ */
646
+ num_samples ?: number ;
647
+
648
+ /**
649
+ * Data File ID
650
+ */
651
+ result_file_id ?: string ;
652
+
653
+ /**
654
+ * Number of ties
655
+ */
656
+ Ties ?: number ;
657
+ }
534
658
}
535
659
536
660
export declare namespace Evaluation {
0 commit comments