diff --git a/sdks/typescript/CHANGELOG.md b/sdks/typescript/CHANGELOG.md index 143338f..257faf5 100644 --- a/sdks/typescript/CHANGELOG.md +++ b/sdks/typescript/CHANGELOG.md @@ -2,6 +2,15 @@ All notable changes to the `@learning-commons/evaluators` TypeScript SDK will be documented in this file. +## [0.3.0] — 2026-03-20 + +### Added + +- **Conventionality Evaluator** — evaluates how explicit, literal, and straightforward a text's meaning is versus how abstract, ironic, figurative, or archaic it is, relative to grades 3–12. +- **Conventionality added to TextComplexityEvaluator** — composite evaluator now runs vocabulary, sentence structure, SMK, and conventionality in parallel; result includes `conventionality` key. + +--- + ## [0.2.0] — 2026-03-18 ### Added diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index cf73044..c82d120 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -180,9 +180,71 @@ console.log(result._internal.identified_topics); // ["hydraulics", "propulsion", --- -### 4. Text Complexity Evaluator +### 4. Conventionality Evaluator -Composite evaluator that analyzes vocabulary, sentence structure, and subject matter knowledge complexity in parallel. +Evaluates how explicit, literal, and straightforward a text's meaning is versus how abstract, ironic, figurative, or archaic it is for the target grade level. Based on the Common Core Qualitative Text Complexity Rubric. 
+ +**Supported Grades:** 3-12 + +**Uses:** Google Gemini 3 Flash Preview + +**Constructor:** +```typescript +const evaluator = new ConventionalityEvaluator({ + googleApiKey?: string; // Google API key (required by this evaluator) + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string, grade: string) +``` + +**Returns:** +```typescript +{ + score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex'; + reasoning: string; + metadata: { + model: string; + processingTimeMs: number; + }; + _internal: { + conventionality_features: string[]; + grade_context: string; + instructional_insights: string; + complexity_score: 'Slightly complex' | 'Moderately complex' | 'Very complex' | 'Exceedingly complex'; + reasoning: string; + }; +} +``` + +**Example:** +```typescript +import { ConventionalityEvaluator } from '@learning-commons/evaluators'; + +const evaluator = new ConventionalityEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, +}); + +const result = await evaluator.evaluate( + "The author uses sustained irony to critique societal norms throughout the passage.", + "10" +); +console.log(result.score); // "Very complex" +console.log(result.reasoning); +console.log(result._internal.conventionality_features); // ["sustained irony", ...] +``` + +--- + +### 5. Text Complexity Evaluator + +Composite evaluator that analyzes vocabulary, sentence structure, subject matter knowledge, and conventionality complexity in parallel. 
**Supported Grades:** 3-12 @@ -211,10 +273,11 @@ await evaluator.evaluate(text: string, grade: string) vocabulary: EvaluationResult | { error: Error }; sentenceStructure: EvaluationResult | { error: Error }; subjectMatterKnowledge: EvaluationResult | { error: Error }; + conventionality: EvaluationResult | { error: Error }; } ``` -Each sub-evaluator result is either a full `EvaluationResult` or `{ error: Error }` if that evaluator failed. An error is only thrown if all three fail. +Each sub-evaluator result is either a full `EvaluationResult` or `{ error: Error }` if that evaluator failed. An error is only thrown if all four fail. **Example:** ```typescript @@ -236,11 +299,14 @@ if (!('error' in result.sentenceStructure)) { if (!('error' in result.subjectMatterKnowledge)) { console.log('Subject matter knowledge:', result.subjectMatterKnowledge.score); } +if (!('error' in result.conventionality)) { + console.log('Conventionality:', result.conventionality.score); +} ``` --- -### 5. Grade Level Appropriateness Evaluator +### 6. Grade Level Appropriateness Evaluator Determines appropriate grade level for text. 
/** Gemini model used for the single LLM call made by this evaluator. */
const CONVENTIONALITY_MODEL = 'gemini-3-flash-preview';

/** Provider identifier reported in result metadata, stage details and telemetry. */
const CONVENTIONALITY_PROVIDER_ID = `google:${CONVENTIONALITY_MODEL}`;

/**
 * Sum input/output token counts over every recorded stage.
 * Stages without `token_usage` contribute zero.
 */
function sumTokenUsage(stages: readonly StageDetail[]): { input_tokens: number; output_tokens: number } {
  let inputTokens = 0;
  let outputTokens = 0;
  for (const stage of stages) {
    inputTokens += stage.token_usage?.input_tokens ?? 0;
    outputTokens += stage.token_usage?.output_tokens ?? 0;
  }
  return { input_tokens: inputTokens, output_tokens: outputTokens };
}

/**
 * Conventionality Evaluator
 *
 * Evaluates how explicit, literal, and straightforward a text's meaning is versus
 * how abstract, ironic, figurative, or archaic it is for the target grade level.
 *
 * Based on the Common Core Qualitative Text Complexity Rubric with 4 levels:
 * - Slightly complex
 * - Moderately complex
 * - Very complex
 * - Exceedingly complex
 *
 * @example
 * ```typescript
 * const evaluator = new ConventionalityEvaluator({
 *   googleApiKey: process.env.GOOGLE_API_KEY
 * });
 *
 * const result = await evaluator.evaluate(text, "6");
 * console.log(result.score);      // "Moderately complex"
 * console.log(result.reasoning);
 * ```
 */
export class ConventionalityEvaluator extends BaseEvaluator {
  static readonly metadata = {
    id: 'conventionality',
    name: 'Conventionality',
    description: 'Evaluates how explicit, literal, and straightforward a text\'s meaning is relative to grade level',
    supportedGrades: ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] as const,
    requiresGoogleKey: true,
    requiresOpenAIKey: false,
  };

  /** Google provider bound to the conventionality model; created once per evaluator. */
  private readonly provider: LLMProvider;

  /**
   * @param config - Shared evaluator configuration; `googleApiKey` is forwarded to the
   *   Google provider and `maxRetries` is taken from the resolved base config.
   */
  constructor(config: BaseEvaluatorConfig) {
    super(config);

    this.provider = createProvider({
      type: 'google',
      model: CONVENTIONALITY_MODEL,
      apiKey: config.googleApiKey,
      maxRetries: this.config.maxRetries,
    });
  }

  /**
   * Evaluate conventionality complexity for a given text and grade level
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level (3-12)
   * @returns Evaluation result with complexity score and detailed analysis
   * @throws {ValidationError} If text is empty, too short/long, or grade is invalid
   * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError)
   */
  async evaluate(
    text: string,
    grade: string
  ): Promise<EvaluationResult<TextComplexityLevel, ConventionalityInternal>> {
    this.logger.info('Starting Conventionality evaluation', {
      evaluator: 'conventionality',
      operation: 'evaluate',
      grade,
      textLength: text.length,
    });

    const startTime = Date.now();
    const stageDetails: StageDetail[] = [];

    try {
      // Validate inputs — inside try so validation errors are telemetered.
      this.validateText(text);
      this.validateGrade(grade, new Set(ConventionalityEvaluator.metadata.supportedGrades));

      this.logger.debug('Evaluating conventionality complexity', {
        evaluator: 'conventionality',
        operation: 'conventionality_evaluation',
      });

      // The Flesch-Kincaid grade is computed locally and passed to the prompt as context.
      const fkScore = calculateFleschKincaidGrade(text);
      const response = await this.evaluateConventionality(text, grade, fkScore);

      stageDetails.push({
        stage: 'conventionality_evaluation',
        provider: CONVENTIONALITY_PROVIDER_ID,
        latency_ms: response.latencyMs,
        token_usage: {
          input_tokens: response.usage.inputTokens,
          output_tokens: response.usage.outputTokens,
        },
      });

      const latencyMs = Date.now() - startTime;

      const result: EvaluationResult<TextComplexityLevel, ConventionalityInternal> = {
        score: response.data.complexity_score,
        reasoning: response.data.reasoning,
        metadata: {
          model: CONVENTIONALITY_PROVIDER_ID,
          processingTimeMs: latencyMs,
        },
        _internal: response.data,
      };

      // Send success telemetry (fire-and-forget)
      this.sendTelemetry({
        status: 'success',
        latencyMs,
        textLength: text.length,
        grade,
        provider: CONVENTIONALITY_PROVIDER_ID,
        tokenUsage: sumTokenUsage(stageDetails),
        metadata: {
          stage_details: stageDetails,
        },
        inputText: text,
      }).catch(() => {
        // Ignore telemetry errors
      });

      this.logger.info('Conventionality evaluation completed successfully', {
        evaluator: 'conventionality',
        operation: 'evaluate',
        grade,
        score: result.score,
        processingTimeMs: latencyMs,
      });

      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;
      // Stage details exist only if the LLM call finished before the failure.
      const hasStages = stageDetails.length > 0;

      this.logger.error('Conventionality evaluation failed', {
        evaluator: 'conventionality',
        operation: 'evaluate',
        grade,
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs,
        completedStages: stageDetails.length,
      });

      this.sendTelemetry({
        status: 'error',
        latencyMs,
        textLength: text.length,
        grade,
        provider: CONVENTIONALITY_PROVIDER_ID,
        tokenUsage: hasStages ? sumTokenUsage(stageDetails) : undefined,
        errorCode: error instanceof Error ? error.name : 'UnknownError',
        metadata: hasStages ? { stage_details: stageDetails } : undefined,
        inputText: text,
      }).catch(() => {
        // Ignore telemetry errors
      });

      // Validation errors reach the caller unchanged; everything else is wrapped.
      if (error instanceof ValidationError) {
        throw error;
      }

      throw wrapProviderError(error, 'Conventionality evaluation failed');
    }
  }

  /**
   * Run the Conventionality evaluation LLM call
   *
   * Issues one structured-output request (system + user message, temperature 0)
   * validated against `ConventionalityOutputSchema`.
   *
   * @param text - The text to evaluate
   * @param grade - The target grade level
   * @param fkScore - Flesch-Kincaid grade level score interpolated into the user prompt
   * @returns Parsed model output together with token usage and call latency
   */
  private async evaluateConventionality(
    text: string,
    grade: string,
    fkScore: number
  ): Promise<{ data: ConventionalityInternal; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> {
    const response = await this.provider.generateStructured({
      messages: [
        { role: 'system', content: getSystemPrompt() },
        { role: 'user', content: getUserPrompt(text, grade, fkScore) },
      ],
      schema: ConventionalityOutputSchema,
      temperature: 0,
    });

    return {
      data: response.data,
      usage: response.usage,
      latencyMs: response.latencyMs,
    };
  }
}

/**
 * Functional API for Conventionality evaluation
 *
 * Constructs a fresh `ConventionalityEvaluator` from `config` for this one call.
 *
 * @example
 * ```typescript
 * const result = await evaluateConventionality(
 *   "The author uses sustained irony to critique societal norms.",
 *   "10",
 *   { googleApiKey: process.env.GOOGLE_API_KEY }
 * );
 * ```
 */
export async function evaluateConventionality(
  text: string,
  grade: string,
  config: BaseEvaluatorConfig
): Promise<EvaluationResult<TextComplexityLevel, ConventionalityInternal>> {
  const evaluator = new ConventionalityEvaluator(config);
  return evaluator.evaluate(text, grade);
}
+ * Composite evaluator that analyzes vocabulary, sentence structure, subject matter knowledge, and conventionality. * Runs all evaluations in parallel with concurrency control to avoid rate limiting. * * Uses: * - VocabularyEvaluator (Google Gemini 2.5 Pro + OpenAI GPT-4o) * - SentenceStructureEvaluator (OpenAI GPT-4o) * - SmkEvaluator (Google Gemini 3 Flash Preview) + * - ConventionalityEvaluator (Google Gemini 3 Flash Preview) * * @example * ```typescript @@ -47,7 +51,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { static readonly metadata = { id: 'text-complexity', name: 'Text Complexity', - description: 'Composite evaluator analyzing vocabulary, sentence structure, and subject matter knowledge complexity', + description: 'Composite evaluator analyzing vocabulary, sentence structure, subject matter knowledge, and conventionality complexity', supportedGrades: ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] as const, requiresGoogleKey: true, requiresOpenAIKey: true, @@ -56,6 +60,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { private vocabularyEvaluator: VocabularyEvaluator; private sentenceStructureEvaluator: SentenceStructureEvaluator; private smkEvaluator: SmkEvaluator; + private conventionalityEvaluator: ConventionalityEvaluator; private limit: ReturnType; constructor(config: BaseEvaluatorConfig) { @@ -66,6 +71,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { this.vocabularyEvaluator = new VocabularyEvaluator(config); this.sentenceStructureEvaluator = new SentenceStructureEvaluator(config); this.smkEvaluator = new SmkEvaluator(config); + this.conventionalityEvaluator = new ConventionalityEvaluator(config); // Create concurrency limiter (max 3 concurrent operations) this.limit = pLimit(3); @@ -99,27 +105,31 @@ export class TextComplexityEvaluator extends BaseEvaluator { const startTime = Date.now(); // Run all evaluators in parallel with concurrency control - const [vocabResult, sentenceResult, 
smkResult]: [ + const [vocabResult, sentenceResult, smkResult, conventionalityResult]: [ EvaluationResult | { error: Error }, EvaluationResult | { error: Error }, EvaluationResult | { error: Error }, + EvaluationResult | { error: Error }, ] = await Promise.all([ this.limit(() => this.runSubEvaluator(this.vocabularyEvaluator, text, grade)), this.limit(() => this.runSubEvaluator(this.sentenceStructureEvaluator, text, grade)), this.limit(() => this.runSubEvaluator(this.smkEvaluator, text, grade)), + this.limit(() => this.runSubEvaluator(this.conventionalityEvaluator, text, grade)), ]); const latencyMs = Date.now() - startTime; const vocabFailed = 'error' in vocabResult; const sentenceFailed = 'error' in sentenceResult; const smkFailed = 'error' in smkResult; - const hasFailures = vocabFailed || sentenceFailed || smkFailed; + const conventionalityFailed = 'error' in conventionalityResult; + const hasFailures = vocabFailed || sentenceFailed || smkFailed || conventionalityFailed; if (hasFailures) { const errors: string[] = []; if (vocabFailed) errors.push(`Vocabulary: ${vocabResult.error.message}`); if (sentenceFailed) errors.push(`Sentence structure: ${sentenceResult.error.message}`); if (smkFailed) errors.push(`Subject matter knowledge: ${smkResult.error.message}`); + if (conventionalityFailed) errors.push(`Conventionality: ${conventionalityResult.error.message}`); this.logger.error('Text complexity evaluation completed with errors', { evaluator: 'text-complexity', @@ -129,7 +139,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { processingTimeMs: latencyMs, }); - if (vocabFailed && sentenceFailed && smkFailed) { + if (vocabFailed && sentenceFailed && smkFailed && conventionalityFailed) { throw new Error(`Text complexity evaluation failed: ${errors.join('; ')}`); } } @@ -155,7 +165,7 @@ export class TextComplexityEvaluator extends BaseEvaluator { hasFailures, }); - return { vocabulary: vocabResult, sentenceStructure: sentenceResult, 
/**
 * Get the Conventionality evaluator system prompt
 */
export function getSystemPrompt(): string {
  return SYSTEM_PROMPT;
}

/** Matches every placeholder the user prompt template may contain. */
const USER_PROMPT_PLACEHOLDER = /\{(text|grade|fk_score)\}/g;

/**
 * Generate the user prompt for Conventionality evaluation
 *
 * All placeholders are filled in a single pass over the template, using a
 * replacer function. This keeps the caller's text verbatim: `$`-sequences such as
 * `$$` or `$&` are not treated as replacement patterns, and placeholder-looking
 * substrings inside `text` (e.g. a literal `{grade}`) are not substituted.
 *
 * @param text - The text to evaluate
 * @param grade - The target grade level
 * @param fkScore - Flesch-Kincaid grade level score
 * @returns The template with every `{text}`, `{grade}` and `{fk_score}` occurrence filled in
 */
export function getUserPrompt(text: string, grade: string, fkScore: number): string {
  const values: Record<string, string> = {
    text,
    grade,
    fk_score: fkScore.toString(),
  };

  return USER_PROMPT_TEMPLATE.replace(
    USER_PROMPT_PLACEHOLDER,
    // Fall back to the matched token so an unknown key can never inject "undefined".
    (placeholder: string, key: string) => values[key] ?? placeholder
  );
}
// Integration tests call the live Gemini API, so they run only when the caller has
// opted in (RUN_INTEGRATION_TESTS) AND a Google API key is available. Missing either
// one skips the suite, matching the skip message printed in beforeAll and preventing
// the evaluator from being constructed with an undefined key.
const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS ||
  !process.env.GOOGLE_API_KEY;

const describeIntegration = SKIP_INTEGRATION ? describe.skip : describe;

// Test timeout: 2 minutes per test case (allows for 3 attempts with API latency)
const TEST_TIMEOUT_MS = 2 * 60 * 1000;
You need not be afraid to give us our rights for fear we will take too much, for we can't take more than our pint'll hold. The poor men seem to be all in confusion, and don't know what to do. Why children, if you have woman's rights, give it to her and you will feel better. You will have your own rights, and they won't be so much trouble. I can't read, but I can hear.", + expected: 'Exceedingly complex', + acceptable: ['Very complex'], + }, + { + id: 'CONV4', + grade: '4', + // dataset item 3314, annotated: slightly_complex + // Largely explicit, literal explanatory text well within Grade 4 reach + text: 'Look up at the sky. In many places you will see clouds. There are many different types of clouds. They are all different shapes and sizes. Some clouds are fluffy, while others are wispy. Some are big and others are small. Some even resemble familiar shapes. Have you ever wondered how clouds are formed? Clouds are made of evaporated water. Evaporation is when water changes from liquid to gas. Water evaporates from different sources all around you, like lakes, rivers, and the ocean. Can you guess the main source of water for clouds? The main source is the ocean. This is because the ocean makes up such a large part of the world. Seventy-one percent of our earth is covered by ocean. Water evaporates and becomes gas. This gas rises and mixes with particles in the air. It rises and rises until it cools and collects in one part of the sky. This forms a cloud. The kind of cloud that forms depends on the environment. Different clouds form at different heights. They change depending on the temperature, too. There are three major types of clouds: cirrus clouds, stratus clouds, and cumulus clouds. 
Each type of cloud looks different.', + expected: 'Slightly complex', + acceptable: ['Moderately complex'], + }, + { + id: 'CONV5', + grade: '5', + // dataset item 4257, annotated: moderately_complex + // Some archaic phrasing and borderline figurative language ("zest of his everyday life") + text: 'A few days ago I sat by the bedside of a wounded sapper\u2014a reservist\u2014and heard the story of life in a signal-box on a branch line in the North of England. The man was dying. I think he knew it. But the zest of his everyday life was still strong in him. He described the manner in which, on leaving the army originally, he had obtained his post on the railway. He told me that there were three trains each way in the day, and mentioned that on Winter nights the last train was frequently very late. This meant a late supper, but his wife saw to it that everything was kept hot. Sometimes his wife came to the box to meet him if it was a dry night. In the next bed there was a young Scotsman from a Highland district which I know very well. We were friends so soon as he learned that I knew his home. He was a roadman, and we talked of his roads and the changes which had been wrought in them of late years by motor traffic.', + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'CONV6', + grade: '6', + // dataset item 4336, annotated: very_complex + // Abstract national qualities, rhetorical idealization, and figurative framing throughout + text: "The whole world recognizes two qualities in the Englishman: his bravery and his common sense. We know that the Englishman is true to his given word, and that even in the antipodes he never changes his habits. As I write, the postman brings me a letter from the front, dated Oct. 17. The cavalryman who sends it tells of our Allies. 'We are fighting the enemy's cavalry,' he writes, 'and for two days my brigade was in action with the British. 
They know how to fight and they astonish us by their marvelous powers of organization and their coolness.' Yes, we know that of old. We also know that England never closes her doors to liberty. We have a confused memory of the hospitality given to our priests in the times of the Revolution. Now England provides us with fresh proof of her kindness of heart. You have heard the news\u2013the professors and students of the Catholic University of Louvain invited to Cambridge. The destroyed Belgian university reconstituted in the home of the celebrated English university. What a magnificent idea!", + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'CONV7', + grade: '7', + // dataset item 7385, annotated: very_complex + // Figurative and abstract language appears regularly; requires interpretation of tone and implied themes + text: "Our attention was called to the fact that there was 'practicing' going on, and we could, at 8:07, see quick flashes. That these flashes pointed directly at Scarborough we did not for a few minutes comprehend. Then, the fog slowly lifting, we saw a fog that was partly smoke. The castle grew into its place in the six miles distance. It seemed for a moment that the eight-foot-thick Norman walls tottered; but no, whatever tottered was behind the keep. Curiously enough we could barely hear the cannonading, for the wind was keen in the opposite direction, yet we could, as the minutes crept by and the air cleared, see distinctly the flashes from the boats and the flashes in the city. 
After about fifteen minutes there was a cessation, or perhaps a hesitation, that lasted two minutes; then the flashes continued.", + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'CONV8', + grade: '8', + // dataset item 6178, annotated: slightly_complex + // Mostly explicit biographical narrative; historical context adds difficulty but language is largely literal + text: "At Knob Creek the boy began to go to an 'A B C' school. His first teacher was Zachariah Riney. Of course, there were no regular schools in the backwoods then. When a man who 'knew enough' happened to come along, especially if he had nothing else to do, he tried to teach the children of the pioneers in a poor log schoolhouse. It is not likely that little Abe went to school more than a few weeks at this time, for he never had a year's schooling in his life. There was another teacher afterward at Knob Creek\u2013a man named Caleb Hazel. Little is known of either of these teachers except that he taught little Abe Lincoln. If their pupil had not become famous the men and their schools would never have been mentioned in history. An old man, named Austin Gollaher, used to like to tell of the days when he and little Abe went to school together. He said: 'Abe was an unusually bright boy at school, and made splendid progress in his studies. Indeed, he learned faster than any of his schoolmates. Though so young, he studied very hard.'", + expected: 'Slightly complex', + acceptable: ['Moderately complex'], + }, + { + id: 'CONV9', + grade: '9', + // dataset item 5369, annotated: very_complex + // Abstract reasoning and conceptual framing; metaphors central to meaning; implicit rebuttal structure + text: 'No one at this moment knows what electricity is; but for our present purpose we may regard it as a fluid, non-elastic, and without weight, and universally diffused through the universe. 
To judge by recently published statements, a large section of the reading public are taught that this fluid is a source of power, and that it may be made to do the work of coal. This is a delusion. So long as electricity remains in what we may call a normal state of repose, it is inert. Before we can get any work out of electricity a somewhat greater amount of work must be done upon it. If this fundamental and most important truth be kept in view it will not be easy to make a grave mistake in estimating the value of any of the numerous schemes for making electricity do work which will ere long be brought before the public.', + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'CONV10', + grade: '10', + // dataset item 4382, annotated: very_complex + // Sustained irony and implicit comparison (savage/civilized); abstract social critique throughout + text: "I know of no savage custom or habit of thought which has not its mate in civilized countries. For every mischievous or absurd practice of the natural man I can name you one of ours that is essentially the same. And nearly every custom of our barbarian ancestors in historic times persists in some form today. We make ourselves look formidable in battle--for that matter, we fight. Our women paint their faces. We feel it obligatory to dress more or less alike, inventing the most ingenious reasons for doing so and actually despising and persecuting those who do not care to conform. 
Almost within the memory of living persons bearded men were stoned in the streets; and a clergyman in New York who wore his beard as Christ wore his, was put into jail and variously persecuted till he died.", + expected: 'Very complex', + acceptable: ['Moderately complex', 'Exceedingly complex'], + }, + { + id: 'CONV11', + grade: '11', + // dataset item 4323, annotated: moderately_complex + // Vocabulary-driven complexity; abstract political/ideological language with occasional irony but clear argument + text: "But the question, Why are we at war? can be answered fairly well by anybody conversant with the facts of the European situation. We are not at war because the Emperor, as war lord, has sent out word to his legions to begin a war of world-wide aggression, carrying into its vortex intellectual Germany, notwithstanding all her peaceful aspirations. I may fairly claim to be a representative of that intellectual Germany which comes in now for a good deal of sympathy, but I must own that intellectual Germany, as far as I know about her, thoroughly approves of the Emperor's present policy. She approves of it not on the principle merely 'Right or wrong, my country'; she does so because she knows that war has become inevitable, and that we must face that ordeal when we are ready for it, not at the moment most agreeable to our enemies.", + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, + { + id: 'CONV12', + grade: '12', + // dataset item 5855, annotated: moderately_complex + // A few abstract turns of phrase; demand mainly from vocabulary and sentence structure rather than conventionality + text: "Here, the idlers of the place assemble to lounge and gossip, to look out for any outward-bound ships that are to be seen in the Channel, and to criticise the appearance and glorify the capabilities of the little fleet of Looe fishing-boats, riding snugly at anchor before them at the entrance of the bay. 
The inhabitants number some fourteen hundred; and are as good-humoured and unsophisticated a set of people as you will meet with anywhere. The Fisheries and the Coast Trade form their principal means of subsistence. The women take a very fair share of the hard work out of the men's hands. You constantly see them carrying coals from the vessels to the quay in curious hand-barrows: they laugh, scream, and run in each other's way incessantly: but these little irregularities seem to assist, rather than impede them, in the prosecution of their tasks. As to the men, one absorbing interest appears to govern them all. The whole day long they are mending boats, painting boats, cleaning boats, rowing boats, or, standing with their hands in their pockets, looking at boats.", + expected: 'Moderately complex', + acceptable: ['Slightly complex', 'Very complex'], + }, +]; + +describeIntegration.concurrent('Conventionality Evaluator - Comprehensive Test Suite', () => { + let evaluator: ConventionalityEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests (no API keys or RUN_INTEGRATION_TESTS not set)'); + return; + } + + evaluator = new ConventionalityEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + }); + + console.log('\n' + '='.repeat(80)); + console.log('CONVENTIONALITY EVALUATOR - TEST SUITE (PARALLEL)'); + console.log('='.repeat(80)); + console.log(`Running ${TEST_CASES.length} test cases with up to 3 attempts each`); + console.log('Short-circuiting on first expected match'); + console.log('Checking acceptable values if no expected match'); + console.log('='.repeat(80)); + }); + + // Generate individual test for each case + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: Grade ${testCase.grade} - ${testCase.expected}`, async () => { + // Buffer all logs to print atomically at the end (prevents interleaving in parallel tests) + const logBuffer: string[] = []; + + // Test header + logBuffer.push('\n' 
+ '='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id} | Grade: ${testCase.grade}`); + logBuffer.push('='.repeat(80)); + logBuffer.push(`Expected Complexity: ${testCase.expected}`); + logBuffer.push(`Text Preview: ${testCase.text.substring(0, 100)}...`); + logBuffer.push(''); + + // Run the evaluation (returns logs instead of printing) + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, + maxAttempts, + }); + + // Add evaluation logs to buffer (includes detailed summary) + logBuffer.push(...result.logs); + + // Print all logs atomically at the end - single console.log to prevent interleaving + console.log(logBuffer.join('\n')); + + // Assert that we got a match within maxAttempts (expected or acceptable) + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeDefined(); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/conventionality.test.ts b/sdks/typescript/tests/unit/evaluators/conventionality.test.ts new file mode 100644 index 0000000..5894187 --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/conventionality.test.ts @@ -0,0 +1,154 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { ConventionalityEvaluator } from '../../../src/evaluators/conventionality.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +vi.mock('../../../src/telemetry/client.js', () => ({ + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, +})); + +describe('ConventionalityEvaluator - Constructor Validation', () => { + it('should throw with 
specific message when Google API key is missing', () => { + expect(() => new ConventionalityEvaluator({ googleApiKey: '' })).toThrow( + 'Google API key is required for Conventionality evaluator. Pass googleApiKey in config.' + ); + }); +}); + +describe('ConventionalityEvaluator - Metadata', () => { + it('should have correct metadata', () => { + expect(ConventionalityEvaluator.metadata.id).toBe('conventionality'); + expect(ConventionalityEvaluator.metadata.name).toBe('Conventionality'); + expect(ConventionalityEvaluator.metadata.requiresGoogleKey).toBe(true); + expect(ConventionalityEvaluator.metadata.requiresOpenAIKey).toBe(false); + expect(ConventionalityEvaluator.metadata.supportedGrades).toEqual([ + '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', + ]); + }); +}); + +describe('ConventionalityEvaluator - Evaluation Flow', () => { + let evaluator: ConventionalityEvaluator; + let mockProvider: LLMProvider; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new ConventionalityEvaluator({ + googleApiKey: 'test-google-key', + telemetry: false, + }); + + // @ts-expect-error Accessing private property for testing + mockProvider = evaluator.provider; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('should map LLM response to result, call provider once, include text+grade in prompt, use temperature 0', async () => { + const testText = 'The author uses sustained irony throughout to critique the hypocrisy of civilized society by comparing it to so-called primitive customs.'; + const testGrade = '10'; + + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: { + conventionality_features: [ + 'sustained irony: "I know of no savage custom or habit of thought which has not its mate in civilized countries"', + 'implicit comparison requiring inference', + ], + grade_context: 'Sustained irony and abstract social critique exceed typical Grade 10 expectations.', + instructional_insights: 'Pre-teach the rhetorical device of irony; 
discuss the author\'s implied argument before reading.', + complexity_score: 'Very complex', + reasoning: 'The text relies heavily on sustained irony and implicit comparisons that require readers to infer the author\'s critical stance.', + }, + model: 'gemini-3-flash-preview', + usage: { inputTokens: 250, outputTokens: 120 }, + latencyMs: 900, + }); + + const result = await evaluator.evaluate(testText, testGrade); + + expect(result.score).toBe('Very complex'); + expect(result.reasoning).toContain('irony'); + expect(result.metadata.model).toBe('google:gemini-3-flash-preview'); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); + + expect(mockProvider.generateStructured).toHaveBeenCalledTimes(1); + const call = vi.mocked(mockProvider.generateStructured).mock.calls[0]; + expect(call[0].messages[1].content).toContain(testText); + expect(call[0].messages[1].content).toContain(testGrade); + expect(call[0].schema).toBeDefined(); + expect(call[0].temperature).toBe(0); + }); + + it('should propagate LLM API errors', async () => { + vi.mocked(mockProvider.generateStructured).mockRejectedValue(new Error('API timeout')); + + await expect( + evaluator.evaluate('The cat sat on the mat outside.', '5') + ).rejects.toThrow('API timeout'); + }); + + it('should not call provider when input validation fails', async () => { + await expect(evaluator.evaluate('', '5')).rejects.toThrow(); + expect(mockProvider.generateStructured).not.toHaveBeenCalled(); + }); + + it('should pass all _internal fields through from LLM response', async () => { + const mockData = { + conventionality_features: ['literal narrative', 'concrete actions'], + grade_context: 'Text uses mostly literal, accessible language for Grade 5.', + instructional_insights: 'No special scaffolding needed for conventionality.', + complexity_score: 'Slightly complex' as const, + reasoning: 'The text is largely explicit and literal with minimal figurative language.', + }; + + 
vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: mockData, + model: 'gemini-3-flash-preview', + usage: { inputTokens: 180, outputTokens: 90 }, + latencyMs: 700, + }); + + const result = await evaluator.evaluate('Clouds form when water evaporates and rises into the sky.', '5'); + + expect(result._internal).toEqual(mockData); + expect(result._internal?.conventionality_features).toEqual(['literal narrative', 'concrete actions']); + expect(result._internal?.grade_context).toBe('Text uses mostly literal, accessible language for Grade 5.'); + expect(result._internal?.instructional_insights).toBe('No special scaffolding needed for conventionality.'); + }); + + it('should include fk_score in user prompt', async () => { + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: { + conventionality_features: ['figurative language'], + grade_context: 'Appropriate for grade 7.', + instructional_insights: 'Discuss metaphors before reading.', + complexity_score: 'Moderately complex', + reasoning: 'Some figurative language present.', + }, + model: 'gemini-3-flash-preview', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const testText = 'The minutes crept by as the castle grew into its place in the fog-shrouded distance.'; + await evaluator.evaluate(testText, '7'); + + const call = vi.mocked(mockProvider.generateStructured).mock.calls[0]; + // The user prompt should contain the FK score (a number) + expect(call[0].messages[1].content).toMatch(/\d+(\.\d+)?/); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts b/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts index a9bf9ad..7304cb9 100644 --- a/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts +++ b/sdks/typescript/tests/unit/evaluators/text-complexity.test.ts @@ -97,6 +97,7 @@ describe('TextComplexityEvaluator', () => { let vocabSpy: any; let sentenceSpy: any; let smkSpy: any; + let conventionalitySpy: 
any; beforeEach(() => { evaluator = new TextComplexityEvaluator({ @@ -135,6 +136,16 @@ describe('TextComplexityEvaluator', () => { }, _internal: {}, }); + + conventionalitySpy = vi.spyOn((evaluator as any).conventionalityEvaluator, 'evaluate').mockResolvedValue({ + score: 'Moderately complex', + reasoning: 'Conventionality test reasoning', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); }); afterEach(() => { @@ -151,9 +162,11 @@ describe('TextComplexityEvaluator', () => { expect(result.vocabulary).toBeDefined(); expect(result.sentenceStructure).toBeDefined(); expect(result.subjectMatterKnowledge).toBeDefined(); + expect(result.conventionality).toBeDefined(); expect('error' in result.vocabulary).toBe(false); expect('error' in result.sentenceStructure).toBe(false); expect('error' in result.subjectMatterKnowledge).toBe(false); + expect('error' in result.conventionality).toBe(false); }); it('should validate text input', async () => { @@ -192,7 +205,7 @@ describe('TextComplexityEvaluator', () => { } }); - it('should run all three evaluators in parallel', async () => { + it('should run all four evaluators in parallel', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; @@ -201,13 +214,14 @@ describe('TextComplexityEvaluator', () => { const duration = Date.now() - startTime; // With mocked providers that take ~100ms each, parallel execution should be faster than sequential - // Sequential would be ~300ms, parallel should be ~100ms - // Allow some overhead but should be significantly less than 300ms - expect(duration).toBeLessThan(300); + // Sequential would be ~400ms, parallel should be ~100ms + // Allow some overhead but should be significantly less than 400ms + expect(duration).toBeLessThan(400); expect('error' in result.vocabulary).toBe(false); expect('error' in result.sentenceStructure).toBe(false); expect('error' in result.subjectMatterKnowledge).toBe(false); + expect('error' in 
result.conventionality).toBe(false); }); it('should handle partial failures gracefully', async () => { @@ -224,9 +238,10 @@ describe('TextComplexityEvaluator', () => { expect((result.vocabulary as { error: Error }).error).toBeDefined(); expect('error' in result.sentenceStructure).toBe(false); expect('error' in result.subjectMatterKnowledge).toBe(false); + expect('error' in result.conventionality).toBe(false); }); - it('should throw when all three evaluators fail', async () => { + it('should throw when all four evaluators fail', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; @@ -234,12 +249,29 @@ describe('TextComplexityEvaluator', () => { vocabSpy.mockRejectedValue(new Error('Vocabulary evaluation failed')); sentenceSpy.mockRejectedValue(new Error('Sentence structure evaluation failed')); smkSpy.mockRejectedValue(new Error('SMK evaluation failed')); + conventionalitySpy.mockRejectedValue(new Error('Conventionality evaluation failed')); await expect(evaluator.evaluate(text, grade)).rejects.toThrow( 'Text complexity evaluation failed' ); }); + it('should handle conventionality failure while others succeed', async () => { + const text = 'The cat sat on the mat.'; + const grade = '5'; + + conventionalitySpy.mockRejectedValue(new Error('Conventionality evaluation failed')); + + const result = await evaluator.evaluate(text, grade); + + expect(result).toBeDefined(); + expect('error' in result.conventionality).toBe(true); + expect((result.conventionality as { error: Error }).error).toBeDefined(); + expect('error' in result.vocabulary).toBe(false); + expect('error' in result.sentenceStructure).toBe(false); + expect('error' in result.subjectMatterKnowledge).toBe(false); + }); + it('should determine overall complexity correctly', async () => { const text = 'The cat sat on the mat.'; const grade = '5'; @@ -277,11 +309,23 @@ describe('TextComplexityEvaluator', () => { _internal: {}, }); + // Override conventionality to return "Exceedingly complex" + 
conventionalitySpy.mockResolvedValue({ + score: 'Exceedingly complex', + reasoning: 'Conventionality reasoning', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); + const result = await evaluator.evaluate(text, grade); expect('error' in result.vocabulary).toBe(false); expect('error' in result.sentenceStructure).toBe(false); expect('error' in result.subjectMatterKnowledge).toBe(false); + expect('error' in result.conventionality).toBe(false); if (!('error' in result.vocabulary)) { expect(result.vocabulary.score).toBe('Moderately complex'); } @@ -291,6 +335,9 @@ describe('TextComplexityEvaluator', () => { if (!('error' in result.subjectMatterKnowledge)) { expect(result.subjectMatterKnowledge.score).toBe('Very complex'); } + if (!('error' in result.conventionality)) { + expect(result.conventionality.score).toBe('Exceedingly complex'); + } }); it('should preserve individual sub-evaluator reasoning', async () => { @@ -328,6 +375,16 @@ describe('TextComplexityEvaluator', () => { _internal: {}, }); + conventionalitySpy.mockResolvedValue({ + score: 'Moderately complex', + reasoning: 'This is the conventionality reasoning.', + metadata: { + model: 'google:gemini-3-flash-preview', + processingTimeMs: 100, + }, + _internal: {}, + }); + const result = await evaluator.evaluate(text, grade); if (!('error' in result.vocabulary)) { @@ -339,6 +396,9 @@ describe('TextComplexityEvaluator', () => { if (!('error' in result.subjectMatterKnowledge)) { expect(result.subjectMatterKnowledge.reasoning).toBe('This is the SMK reasoning.'); } + if (!('error' in result.conventionality)) { + expect(result.conventionality.reasoning).toBe('This is the conventionality reasoning.'); + } }); });