Skip to content

Commit 2048ac3

Browse files
committed
adds extraction evaluation to the pipeline evaluation
1 parent d2494bd commit 2048ac3

File tree

4 files changed

+1005
-5
lines changed

4 files changed

+1005
-5
lines changed

common/evaluations/pipeline-evaluate.ts

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,18 @@ import {
99
sampleComments,
1010
sampleClusteringData,
1111
} from "./scorers/clustering-scorers.js";
12+
import {
13+
extractionJsonStructureScorer,
14+
claimQualityScorer,
15+
taxonomyAlignmentScorer,
16+
quoteRelevanceScorer,
17+
extractionCompletenessScorer,
18+
createExtractionModel,
19+
extractionTestCases,
20+
} from "./scorers/extraction-scorers.js";
1221
import {
1322
defaultClusteringPrompt,
23+
defaultExtractionPrompt,
1424
defaultSystemPrompt,
1525
hydratePromptLiterals,
1626
} from "../prompts/index.js";
@@ -34,9 +44,15 @@ const clusteringModel = createClusteringModel(
3444
defaultSystemPrompt,
3545
);
3646

37-
async function main() {
38-
await weave.init("t3c-pipeline-evaluation");
47+
// Model passed to the extraction evaluation below. Built by
// createExtractionModel from the shared client, the prompt-hydration helper,
// and the default extraction and system prompts.
const extractionModel = createExtractionModel(
  openaiClient,
  hydratePromptLiterals,
  defaultExtractionPrompt,
  defaultSystemPrompt,
);
3954

55+
async function runClusteringEvaluation() {
4056
const clusteringDataset = new weave.Dataset({
4157
name: "T3C Clustering Dataset",
4258
rows: clusteringExamples,
@@ -53,10 +69,66 @@ async function main() {
5369
});
5470

5571
console.log("Running T3C clustering evaluation...");
56-
const results = await clusteringEvaluation.evaluate({
72+
const clusteringResults = await clusteringEvaluation.evaluate({
5773
model: clusteringModel,
5874
});
59-
console.log("Results:", JSON.stringify(results, null, 2));
75+
console.log(
76+
"Clustering Results:",
77+
JSON.stringify(clusteringResults, null, 2),
78+
);
79+
return clusteringResults;
80+
}
81+
82+
/**
 * Runs the extraction evaluation and reports its results.
 *
 * Wraps `extractionTestCases` in a weave Dataset, evaluates `extractionModel`
 * against it with the five extraction scorers, prints the outcome as
 * pretty-printed JSON, and hands the raw results back to the caller.
 *
 * @returns The value resolved by the weave evaluation's `evaluate` call.
 */
async function runExtractionEvaluation() {
  const dataset = new weave.Dataset({
    name: "T3C Extraction Dataset",
    rows: extractionTestCases,
  });
  const scorers = [
    extractionJsonStructureScorer,
    claimQualityScorer,
    taxonomyAlignmentScorer,
    quoteRelevanceScorer,
    extractionCompletenessScorer,
  ];
  const evaluation = new weave.Evaluation({ dataset, scorers });

  console.log("Running T3C extraction evaluation...");
  const results = await evaluation.evaluate({ model: extractionModel });

  const prettyResults = JSON.stringify(results, null, 2);
  console.log("Extraction Results:", prettyResults);
  return results;
}
109+
110+
/**
 * Command-line entry point for the T3C pipeline evaluation.
 *
 * Initializes the "t3c-pipeline-evaluation" weave project, then selects what to
 * run from the first CLI argument (`process.argv[2]`):
 *   - "clustering": only the clustering evaluation
 *   - "extraction": only the extraction evaluation
 *   - anything else, or no argument: clustering followed by extraction
 *
 * An argument that is present but not recognized still runs the full pipeline,
 * but a warning is printed first so a typo does not silently trigger a
 * different run than the one requested.
 *
 * @returns A promise that resolves once the selected evaluation(s) finish.
 */
async function main() {
  await weave.init("t3c-pipeline-evaluation");

  const evaluationType = process.argv[2];

  switch (evaluationType) {
    case "clustering":
      console.log("Running clustering evaluation...\n");
      await runClusteringEvaluation();
      break;
    case "extraction":
      console.log("Running extraction evaluation...\n");
      await runExtractionEvaluation();
      break;
    default:
      // Only a missing argument is an intentional request for the full run;
      // call out anything else before falling back to it.
      if (evaluationType !== undefined) {
        console.warn(
          `Unrecognized evaluation type "${evaluationType}" (expected "clustering" or "extraction"); running the full pipeline instead.`,
        );
      }
      console.log(
        "Running full T3C pipeline evaluation (clustering + extraction)...\n",
      );
      // The two evaluations run one after the other, clustering first.
      await runClusteringEvaluation();
      await runExtractionEvaluation();
      break;
  }
}
61133

62134
// Start the run. Handle a rejected promise from main() explicitly: log the
// failure and set a non-zero exit code instead of leaving the promise floating.
main().catch((error) => {
  console.error("T3C pipeline evaluation failed:", error);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)