@@ -9,8 +9,18 @@ import {
99 sampleComments ,
1010 sampleClusteringData ,
1111} from "./scorers/clustering-scorers.js" ;
12+ import {
13+ extractionJsonStructureScorer ,
14+ claimQualityScorer ,
15+ taxonomyAlignmentScorer ,
16+ quoteRelevanceScorer ,
17+ extractionCompletenessScorer ,
18+ createExtractionModel ,
19+ extractionTestCases ,
20+ } from "./scorers/extraction-scorers.js" ;
1221import {
1322 defaultClusteringPrompt ,
23+ defaultExtractionPrompt ,
1424 defaultSystemPrompt ,
1525 hydratePromptLiterals ,
1626} from "../prompts/index.js" ;
@@ -34,9 +44,15 @@ const clusteringModel = createClusteringModel(
3444 defaultSystemPrompt ,
3545) ;
3646
37- async function main ( ) {
38- await weave . init ( "t3c-pipeline-evaluation" ) ;
// Model evaluated by the extraction run: createExtractionModel receives the
// shared client, the prompt-hydration helper, the default extraction prompt,
// and the default system prompt, in that order.
const extractionModel = createExtractionModel(
  openaiClient,
  hydratePromptLiterals,
  defaultExtractionPrompt,
  defaultSystemPrompt,
);
3954
55+ async function runClusteringEvaluation ( ) {
4056 const clusteringDataset = new weave . Dataset ( {
4157 name : "T3C Clustering Dataset" ,
4258 rows : clusteringExamples ,
@@ -53,10 +69,66 @@ async function main() {
5369 } ) ;
5470
5571 console . log ( "Running T3C clustering evaluation..." ) ;
56- const results = await clusteringEvaluation . evaluate ( {
72+ const clusteringResults = await clusteringEvaluation . evaluate ( {
5773 model : clusteringModel ,
5874 } ) ;
59- console . log ( "Results:" , JSON . stringify ( results , null , 2 ) ) ;
75+ console . log (
76+ "Clustering Results:" ,
77+ JSON . stringify ( clusteringResults , null , 2 ) ,
78+ ) ;
79+ return clusteringResults ;
80+ }
81+
/**
 * Runs the extraction evaluation.
 *
 * Wraps `extractionTestCases` in a Weave dataset, evaluates `extractionModel`
 * against it with the five extraction scorers, logs the outcome as
 * pretty-printed JSON, and hands the raw results back to the caller.
 *
 * @returns {Promise<*>} Whatever the evaluation's `evaluate()` call resolves to.
 */
async function runExtractionEvaluation() {
  const dataset = new weave.Dataset({
    name: "T3C Extraction Dataset",
    rows: extractionTestCases,
  });

  const scorers = [
    extractionJsonStructureScorer,
    claimQualityScorer,
    taxonomyAlignmentScorer,
    quoteRelevanceScorer,
    extractionCompletenessScorer,
  ];
  const evaluation = new weave.Evaluation({ dataset, scorers });

  console.log("Running T3C extraction evaluation...");
  const extractionResults = await evaluation.evaluate({
    model: extractionModel,
  });

  // Serialise separately so the log call stays on one readable line.
  const prettyResults = JSON.stringify(extractionResults, null, 2);
  console.log("Extraction Results:", prettyResults);

  return extractionResults;
}
109+
/**
 * Entry point: calls `weave.init("t3c-pipeline-evaluation")` and then runs the
 * evaluation selected by the first CLI argument (`process.argv[2]`).
 *
 * - `"clustering"`: runs only the clustering evaluation.
 * - `"extraction"`: runs only the extraction evaluation.
 * - anything else, including no argument: runs clustering, then extraction.
 *
 * An argument that is present but not recognised still runs the full pipeline
 * (unchanged behaviour), but a warning is printed first so that a typo is not
 * silently mistaken for a deliberate full run.
 *
 * @returns {Promise<void>} Resolves once the selected evaluation(s) finish;
 *   rejects if initialisation or any evaluation rejects.
 */
async function main() {
  await weave.init("t3c-pipeline-evaluation");

  const evaluationType = process.argv[2];

  switch (evaluationType) {
    case "clustering":
      console.log("Running clustering evaluation...\n");
      await runClusteringEvaluation();
      break;
    case "extraction":
      console.log("Running extraction evaluation...\n");
      await runExtractionEvaluation();
      break;
    default:
      // No argument means "run everything"; an unknown argument is most
      // likely a mistake, so say so before falling back.
      if (evaluationType !== undefined) {
        console.warn(
          `Unrecognized evaluation type "${evaluationType}" (expected "clustering" or "extraction"); running the full pipeline.`,
        );
      }
      console.log(
        "Running full T3C pipeline evaluation (clustering + extraction)...\n",
      );
      // Sequential on purpose: clustering finishes before extraction starts.
      await runClusteringEvaluation();
      await runExtractionEvaluation();
      break;
  }
}
61133
// Handle a rejected run explicitly instead of leaving the promise returned by
// main() floating: log the failure and mark the process as failed.
main().catch((error) => {
  console.error("T3C pipeline evaluation failed:", error);
  process.exitCode = 1;
});
0 commit comments