Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/benchmarks/src/quizQuestions/QuizQuestionEval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ export function runQuizQuestionEval({
reasoning_enabled: modelId.includes("gemini-2.5") ? true : undefined,
reasoning_budget: modelId.includes("gemini-2.5") ? 1024 : undefined,
};

console.log("reasoningOptions", reasoningOptions);
llmOptions = {
...reasoningOptions,
...(llmOptions ?? {}),
Expand Down
79 changes: 67 additions & 12 deletions packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,91 @@ import fs from "fs";
import path from "path";
import csv from "csv-parser";
import { QuizQuestionData, QuizQuestionDataSchema } from "../QuizQuestionData";
import { makeTags } from "./makeTags";

const testDataPath = path.resolve(__dirname, "..", "..", "..", "testData");
const csvFileInPath = path.resolve(testDataPath, "badge-questions.csv");
const jsonFileOutPath = path.resolve(testDataPath, "badge-questions.json");

const handleAnswers = (row: any) => {
const correctAnswers = row.Answer.trim()?.split("") || [];
Comment on lines +10 to +11
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of passing `row: any`, please make sure that the input is strongly typed. You can use zod to do any validation that you need.

const answers = ["A", "B", "C", "D", "E", "F"]
.map((label) => {
const isCorrect = correctAnswers.includes(label);
return {
answer: row[label],
isCorrect,
label,
};
})
.filter((answer) => answer.answer && answer.answer.trim() !== ""); // Remove empty answers
Comment on lines +12 to +21
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this code could be a little cleaner. Rather than filtering at the end, can you slice the array of ["A", "B", "C", ...] to the correct length before running the map over it?

return answers;
};

// const createTagsFromAssessmentName = (assessmentName: string) => {
// return assessmentName.split(",").map((tag) => tag.trim());
// };

/**
  Lookup from a badge assessment name to the tags applied to its questions.

  Keys are compared against the trimmed "Assessment" column of each CSV row;
  a name that is not a key here yields an empty tag list at the call site.
 */
const assessmentNameToTagsMap = {
  "MongoDB Aggregation Fundamentals": ["aggregation"],
  "MongoDB Query Optimization Techniques": ["query"],
  "From Relational Model (SQL) to MongoDB's Document Model": ["data_modeling"],
  "MongoDB Schema Design Patterns and Antipatterns": ["data_modeling"],
  "MongoDB Advanced Schema Design Patterns and Antipatterns": [
    "data_modeling",
  ],
  "MongoDB Schema Design Optimization": ["data_modeling"],
  "Building AI Agents with MongoDB": ["gen_ai"],
  "Building AI-Powered Search with MongoDB Vector Search": ["gen_ai"],
  "Building RAG Apps Using MongoDB": ["gen_ai"],
  "MongoDB Indexing Design Fundamentals": ["indexing"],
  "Monitoring MongoDB with Built-in Tools": [
    "monitoring_tuning_and_automation",
  ],
  "Optimizing MongoDB Performance with Tuning Tools": [
    "monitoring_tuning_and_automation",
  ],
  "CRUD Operations in MongoDB": ["query"],
  "Search with MongoDB": ["search"],
  "Securing MongoDB Atlas: Authentication & Authorization": ["security"],
  "Securing MongoDB Self-Managed: Authentication & Authorization": [
    "security",
  ],
  "MongoDB Sharding Strategies": ["sharding"],
  "Optimizing and Maintaining MongoDB Cluster Reliability": [
    "performance_at_scale",
  ],
};

// excluded:
// 'MongoDB Overview: Core Concepts and Architecture'

const parseCSV = async (filePath: string): Promise<QuizQuestionData[]> => {
return new Promise((resolve, reject) => {
const results: QuizQuestionData[] = [];
const assessments = new Set<string>();
fs.createReadStream(filePath)
.pipe(csv())
.on("data", (row) => {
// console.log("HIT TRY");
try {
const answers = ["A", "B", "C", "D"].map((label, index) => ({
answer: row[label],
isCorrect: row.Answer === (index + 1).toString(),
label,
}));

const assessmentName = row["Assessment"]?.trim();
if (!assessmentName) {
console.warn("Skipping row with missing assessment name");
return;
}

// Type guard to ensure assessmentName is a valid key
if (assessmentName in assessmentNameToTagsMap) {
console.log('>> tags', assessmentNameToTagsMap[assessmentName as keyof typeof assessmentNameToTagsMap]);
} else {
console.warn(`Assessment name not found in map: "${assessmentName}"`);
}

const answers = handleAnswers(row);
const questionData: QuizQuestionData = QuizQuestionDataSchema.parse({
questionText: row["Question Text"],
title: row["Assessment"],
title: assessmentName,
topicType: "badge", // Defaulting topic type
questionType: "singleCorrect", // Assuming single correct answer
questionType:
row["Answer"].length > 1 ? "multipleCorrect" : "singleCorrect",
Comment on lines +80 to +81
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider doing more type-safe parsing rather than checking string length (which isn't very resilient). For example, there could be an error in the spreadsheet.

answers,
explanation: row["Reference"],
tags: row["tags"] ? row["tags"].split(",") : [],
// tags: row["tags"] ? row["tags"].split(",") : [],
tags: assessmentName in assessmentNameToTagsMap
? assessmentNameToTagsMap[assessmentName as keyof typeof assessmentNameToTagsMap]
: [],
});
questionData.tags = makeTags(questionData);
results.push(questionData);
// console.log(">>>> assessments >>>>", assessments);
} catch (error) {
console.error("Validation error:", error);
}
Expand Down
5 changes: 1 addition & 4 deletions packages/benchmarks/src/quizQuestions/bin/makeTags.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import mongoDbMetadata from "mongodb-rag-core/mongoDbMetadata";
import { mongoDbProductNames, mongoDbTopics, mongoDbProgrammingLanguages } from "mongodb-rag-core/mongoDbMetadata";
import { QuizQuestionData } from "../QuizQuestionData";

const { mongoDbProductNames, mongoDbTopics, mongoDbProgrammingLanguages } =
mongoDbMetadata;

const programmingLanguageTags = [
...mongoDbProgrammingLanguages
.map((pl) => pl.id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,46 @@ import { aggregateExperimentScoreMean } from "../../reporting/aggregateExperimen
const { BRAINTRUST_API_KEY } = assertEnvVars(BRAINTRUST_ENV_VARS);
const projectName = "mongodb-multiple-choice";
const experiments = [
{ experimentName: "mistral-large-2", model: "Mistral Large 2" },
{ experimentName: "gemini-2-flash", model: "Gemini 2 Flash" },
{ experimentName: "claude-35-sonnet-v2", model: "Claude 3.5 Sonnet v2" },
{ experimentName: "llama-3.1-70b", model: "Llama 3.1 70B" },
{ experimentName: "nova-pro-v1:0", model: "Nova Pro v1" },
{ experimentName: "gpt-4o", model: "GPT-4o" },
{
model: "GPT 5",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5&datasets=mdbu_quiz_all-ad20cb9f",
},
{
model: "GPT 5 Mini",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5-mini&datasets=mdbu_quiz_all-ab823151",
},
{
model: "GPT 4o",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o&datasets=mdbu_quiz_all-80e9b77e",
},
{
model: "GPT 4o Mini",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o-mini&datasets=mdbu_quiz_all",
},
{
model: "Gemini 2.5 Pro",
experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-pro&datasets=mdbu_quiz_all",
},
{
model: "Gemini 2.5 Flash",
experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-flash&datasets=mdbu_quiz_all",
},
{
model: "Claude 4.1 Opus",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-opus-4.1&datasets=mdbu_quiz_all",
},
{
model: "Claude 4 Sonnet",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-sonnet-4&datasets=mdbu_quiz_all",
},
{
model: "Claude 3.7 Sonnet",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-37-sonnet&datasets=mdbu_quiz_all",
},
{
model: "Mistral Large 2",
experimentName: "multiple_choice?experimentType=answer_question&model=mistral-large-2&datasets=mdbu_quiz_all",
},
];

const basePathOut = path.resolve(__dirname, "..", "..", "..", "testData");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,56 +19,44 @@ const { BRAINTRUST_API_KEY } = assertEnvVars(BRAINTRUST_ENV_VARS);
const projectName = "mongodb-multiple-choice";
const experiments = [
{
model: "GPT-4o",
experimentName: "gpt-4o-badge-631d3a9b",
model: "GPT 5",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5&datasets=mdbu_quiz_badge-0bf72a0a",
},
{
model: "Claude 3.5 Sonnet v2",
experimentName: "claude-35-sonnet-v2-badge-cb743d9f",
model: "GPT 5 Mini",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5-mini&datasets=mdbu_quiz_badge-7ed6151c",
},
{
model: "Claude 3.5 Sonnet",
experimentName: "claude-35-sonnet-badge-f3427e16",
model: "GPT 4o",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o&datasets=mdbu_quiz_badge-3de142aa",
},
{ model: "Gemini 2 Flash", experimentName: "gemini-2-flash-badge-76fea4f5" },
{
model: "Claude 3.5 Haiku",
experimentName: "claude-35-haiku-badge-4f4d32bb",
model: "GPT 4o Mini",
experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o-mini&datasets=mdbu_quiz_badge-470f00b2",
},
{ model: "Nova Pro v1", experimentName: "nova-pro-v1:0-badge-e76a0833" },
{ model: "Llama 3.1 70B", experimentName: "llama-3.1-70b-badge-f2e28e86" },
{ model: "Llama 3.2 90B", experimentName: "llama-3.2-90b-badge-81111f12" },
{
model: "Claude 3.5 Haiku",
experimentName: "claude-35-haiku-badge-4f4d32bb",
model: "Gemini 2.5 Pro",
experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-pro&datasets=mdbu_quiz_badge-9be5ad74",
},
{
model: "Gemini 1.5 Flash",
experimentName: "gemini-1.5-flash-002-badge-e0141bec",
model: "Gemini 2.5 Flash",
experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-flash&datasets=mdbu_quiz_badge-1cc693a9",
},
{ model: "Nova Lite v1", experimentName: "nova-lite-v1:0-badge-c896c5f3" },
{ model: "Llama 3 70B", experimentName: "llama-3-70b-badge-54545f72" },
{ model: "GPT-4o Mini", experimentName: "gpt-4o-mini-badge" },
{
model: "Gemini 1.5 Pro",
experimentName: "gemini-1.5-pro-002-badge-fc8268f2",
model: "Claude 4.1 Opus",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-opus-4.1&datasets=mdbu_quiz_badge-6308714f",
},
{
model: "GPT-35 Turbo 16k",
experimentName: "gpt-35-turbo-16k-badge-6282561d",
model: "Claude 4 Sonnet",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-sonnet-4&datasets=mdbu_quiz_badge-98f0f3af",
},
{
model: "Gemini 1.0 Pro",
experimentName: "gemini-1.0-pro-002-badge-62c646b1",
model: "Claude 3.7 Sonnet",
experimentName: "multiple_choice?experimentType=answer_question&model=claude-37-sonnet&datasets=mdbu_quiz_badge-d9c09975",
},
{ model: "Nova Micro v1", experimentName: "nova-micro-v1:0-badge-e415a9d7" },
{
model: "Mistral Large 2",
experimentName: "mistral-large-2-badge-2526d48b",
},
{
model: "Claude 3 Sonnet",
experimentName: "claude-3-sonnet-badge-a8791d43",
experimentName: "multiple_choice?experimentType=answer_question&model=mistral-large-2&datasets=mdbu_quiz_badge-5aad3637",
},
];

Expand Down Expand Up @@ -127,15 +115,53 @@ function createCsvHeaders(data: CsvData[]): CsvHeader[] {
}));
}

/**
  Extracts model name from experiment name.
  The model name is the value of the `model` query-style parameter, e.g.
  "...?experimentType=answer_question&model=gpt-5&datasets=..." yields "gpt-5".

  The parameter may be introduced by either "?" or "&" and may be followed by
  another "&..." parameter or by the end of the string, so the result does not
  depend on `model` sitting directly before `&datasets`.

  @param experimentName The experiment name to extract model from.
  @returns The extracted model name, or "unknown" when the experiment name
    carries no non-empty `model` parameter.
 */
function extractModelName(experimentName: string): string {
  // Anchor on "?" or "&" so a parameter merely ending in "model" is not matched.
  const modelMatch = /[?&]model=([^&]+)/.exec(experimentName);
  return modelMatch?.[1] ?? "unknown";
}

/**
Type definition for detailed quiz results CSV data.

Each value is one CSV row: the fixed question fields, a running
"Correct Count", and one additional column per model whose output
is recorded for that question.
*/
type DetailedQuizResult = {
title: string;
questionText: string;
// JSON-serialized answer list (populated via JSON.stringify in main).
answers: string;
expected: string;
// Incremented for each result whose CorrectQuizAnswer score is 1.
"Correct Count": number;
// Index signature for the per-model output columns added at runtime. Its value
// type must also admit the number-typed "Correct Count" declared above.
[key: string]: string | number; // Dynamic model columns and Correct Count
};

async function main() {
const outputDir = path.join(basePathOut, "csv", "badge");
ensureOutputDirectory(outputDir);

const titleTags = [
"Relational to Document Model",
"Schema Patterns and Antipatterns",
"Schema Design Optimization",
"Advanced Schema Patterns and Antipatterns",
"MongoDB Aggregation Fundamentals",
"MongoDB Query Optimization Techniques",
"From Relational Model (SQL) to MongoDB's Document Model",
"MongoDB Schema Design Patterns and Antipatterns",
"MongoDB Advanced Schema Design Patterns and Antipatterns",
"MongoDB Schema Design Optimization",
"Building AI Agents with MongoDB",
"Building AI-Powered Search with MongoDB Vector Search",
"Building RAG Apps Using MongoDB",
"MongoDB Indexing Design Fundamentals",
"Monitoring MongoDB with Built-in Tools",
"Optimizing MongoDB Performance with Tuning Tools",
"CRUD Operations in MongoDB",
"Search with MongoDB",
"Securing MongoDB Atlas: Authentication & Authorization",
"Securing MongoDB Self-Managed: Authentication & Authorization",
"MongoDB Sharding Strategies",
"Optimizing and Maintaining MongoDB Cluster Reliability",
] as const;

// Define a type for the quiz titles
Expand All @@ -148,6 +174,7 @@ async function main() {
} & Partial<Record<QuizTitle, number>>;

const experimentAggregates: ExperimentAggregate[] = [];
const detailedResults: DetailedQuizResult[] = [];

// Process each experiment
for (const { experimentName, model } of experiments) {
Expand All @@ -164,6 +191,39 @@ async function main() {
apiKey: BRAINTRUST_API_KEY,
});

// Extract model name from experiment name
const extractedModelName = extractModelName(experimentName);

// Process detailed results for the new CSV
results.forEach((result) => {
// Find existing detailed result or create new one
let detailedResult = detailedResults.find(
(dr) =>
dr.questionText === result.input.questionText &&
dr.expected === result.expected
);

if (!detailedResult) {
// Create new detailed result
detailedResult = {
title: result.metadata?.title || '',
questionText: result.input.questionText,
answers: JSON.stringify(result.input.answers),
expected: result.expected,
"Correct Count": 0,
};
detailedResults.push(detailedResult);
}

// Add the model's output to the detailed result
detailedResult[extractedModelName] = result.output || '';

// Add to correct count if this result is correct
if (result.scores?.CorrectQuizAnswer === 1) {
detailedResult["Correct Count"] += 1;
}
});

// Add the quiz name as a tag if metadata.title is defined
// Note: in the future we should do better tagging in advance to avoid hacks like this.
const resultsWithQuizNameTag = results.map((result) => {
Expand Down Expand Up @@ -225,6 +285,13 @@ async function main() {
createCsvHeaders(experimentAggregates),
path.join(outputDir, "badge_quiz_question_experiment_aggregates.csv")
);

// Write detailed quiz results to CSV
await writeDataToCsv(
detailedResults,
createCsvHeaders(detailedResults),
path.join(outputDir, "detailed_quiz_question_results.csv")
);
}

main().catch(console.error);