From dbad36d891451fbdaf9f19c1572a8dd5fc8a741b Mon Sep 17 00:00:00 2001 From: Owen McGirr Date: Fri, 10 Apr 2026 20:59:58 +0100 Subject: [PATCH 1/3] feat: ElevenLabs v3 support (audio tags, new request params) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _ssmlToV3AudioTags() to translate SSML to v3 inline audio tags: emphasis strong/moderate → [excited], reduced → [whispers], break → [pause] - prepareText() now routes to tag translation for eleven_v3, strips for all other models - Expose seed, languageCode, previousText, nextText, applyTextNormalization in ElevenLabsTTSOptions - Include v3 params in buildRequestPayload() when set - Add 16 unit tests covering tag translation and new request params Closes #36 --- src/__tests__/elevenlabs-v3.test.ts | 133 ++++++++++++++++++++++++++++ src/engines/elevenlabs.ts | 78 ++++++++++++++-- 2 files changed, 203 insertions(+), 8 deletions(-) create mode 100644 src/__tests__/elevenlabs-v3.test.ts diff --git a/src/__tests__/elevenlabs-v3.test.ts b/src/__tests__/elevenlabs-v3.test.ts new file mode 100644 index 0000000..b46253a --- /dev/null +++ b/src/__tests__/elevenlabs-v3.test.ts @@ -0,0 +1,133 @@ +import { describe, it, expect } from "@jest/globals"; +import { ElevenLabsTTSClient } from "../engines/elevenlabs"; + +// Access private method for unit testing +function ssmlToV3AudioTags(client: ElevenLabsTTSClient, ssml: string): string { + return (client as any)._ssmlToV3AudioTags(ssml); +} + +describe("ElevenLabs v3 audio tag translation", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + + it("translates strong emphasis to [excited]", () => { + const result = ssmlToV3AudioTags( + client, + "Hello world" + ); + expect(result).toBe("Hello world [excited]"); + }); + + it("translates moderate emphasis to [excited]", () => { + const result = ssmlToV3AudioTags( + client, + "hello" + ); + expect(result).toBe("hello [excited]"); + }); + + it("translates reduced emphasis to [whispers]", () => { + const result = ssmlToV3AudioTags( + client, + "quiet" + ); + expect(result).toBe("quiet [whispers]"); + }); + + it("translates emphasis without level to [excited]", () => { + const result = ssmlToV3AudioTags( + client, + "hey" + ); + expect(result).toBe("hey [excited]"); + }); + + it("translates to [pause]", () => { + const result = ssmlToV3AudioTags( + client, + "Helloworld" + ); + expect(result).toBe("Hello[pause]world"); + }); + + it("strips prosody tags but keeps content", () => { + const result = ssmlToV3AudioTags( + client, + "take it easy" + ); + expect(result).toBe("take it easy"); + }); + + it("strips speak wrapper", () => { + const result = ssmlToV3AudioTags(client, "plain text"); + expect(result).toBe("plain text"); + }); + + it("preserves existing [audio tags] in plain text through prepareText", async () => { + const v3Client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); + // Plain text with audio tags should pass through unchanged + const text = "Hello [excited] world [whispers]"; + const prepared = await (v3Client as any).prepareText(text, {}); + expect(prepared).toBe(text); + }); + + it("strips SSML for non-v3 models", async () => { + const v2Client = new ElevenLabsTTSClient({ + apiKey: "test-key", + modelId: "eleven_multilingual_v2", + }); + const ssml = 'hello'; + const prepared = await (v2Client as any).prepareText(ssml, {}); + // Should strip all tags, no [excited] added + expect(prepared).toBe("hello"); + expect(prepared).not.toContain("[excited]"); + }); + + it("translates SSML to audio tags for eleven_v3 model", async () => { + const v3Client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); + const ssml = 'Normal dramatic end'; + const prepared = await (v3Client as any).prepareText(ssml, {}); + expect(prepared).toBe("Normal dramatic [excited] end"); + }); +}); + +describe("ElevenLabs v3 request parameters", () => { + it("includes seed in payload when set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", { seed: 42 }); + expect(payload.seed).toBe(42); + }); + + it("includes language_code in payload when set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", { languageCode: "en" }); + expect(payload.language_code).toBe("en"); + }); + + it("includes previous_text in payload when set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", { previousText: "Before this" }); + expect(payload.previous_text).toBe("Before this"); + }); + + it("includes next_text in payload when set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", { nextText: "After this" }); + expect(payload.next_text).toBe("After this"); + }); + + it("includes apply_text_normalization in payload when set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", { applyTextNormalization: "off" }); + expect(payload.apply_text_normalization).toBe("off"); + }); + + it("omits v3 params from payload when not set", () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); + const payload = (client as any).buildRequestPayload("hello", {}); + expect(payload.seed).toBeUndefined(); + expect(payload.language_code).toBeUndefined(); + expect(payload.previous_text).toBeUndefined(); + expect(payload.next_text).toBeUndefined(); + expect(payload.apply_text_normalization).toBeUndefined(); + }); +}); diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index fe70528..453f01d 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -17,6 +17,12 @@ export interface ElevenLabsTTSOptions extends SpeakOptions { outputFormat?: string; // Override output_format per request voiceSettings?: Record; // Override voice_settings per request requestOptions?: Record; // Additional request payload overrides + // v3-specific parameters + seed?: number; // Seed for deterministic output (eleven_v3+) + languageCode?: string; // Force language interpretation (e.g. "en") + previousText?: string; // Context for continuity between requests + nextText?: string; // Context for continuity between requests + applyTextNormalization?: "auto" | "on" | "off"; // Control spelling/number expansion } /** @@ -259,6 +265,14 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { merged.output_format = this.resolveOutputFormat(options, merged); merged.voice_settings = this.resolveVoiceSettings(options, merged); + // v3 optional parameters + if (options?.seed !== undefined) merged.seed = options.seed; + if (options?.languageCode) merged.language_code = options.languageCode; + if (options?.previousText) merged.previous_text = options.previousText; + if (options?.nextText) merged.next_text = options.nextText; + if (options?.applyTextNormalization) + merged.apply_text_normalization = options.applyTextNormalization; + return merged; } @@ -434,26 +448,74 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { } /** - * Prepare text for synthesis by stripping SSML tags + * Translate SSML to ElevenLabs v3 audio tags (best-effort, lossy). + * eleven_v3 uses inline [tag] annotations instead of SSML markup. + * + * Mappings: + * → …[excited] + * → …[whispers] + * → …[excited] + * → [pause] + * → … (content kept, tags stripped) + * All other SSML tags → content kept, tags stripped + */ + private _ssmlToV3AudioTags(ssml: string): string { + let text = ssml; + + // Unwrap wrapper + text = text.replace(/<\/?speak[^>]*>/gi, ""); + + // → … [excited] + text = text.replace( + /]*>([\s\S]*?)<\/emphasis>/gi, + (_match, _level, content) => `${content.trim()} [excited]` + ); + + // → … [whispers] + text = text.replace( + /]*>([\s\S]*?)<\/emphasis>/gi, + (_match, content) => `${content.trim()} [whispers]` + ); + + // without level → … [excited] + text = text.replace( + /]*>([\s\S]*?)<\/emphasis>/gi, + (_match, content) => `${content.trim()} [excited]` + ); + + // → [pause] + text = text.replace(//gi, "[pause]"); + + // Strip remaining SSML tags, preserving content + text = this._stripSSML(text); + + return text.trim(); + } + + /** + * Prepare text for synthesis, handling SSML and v3 audio tags. * @param text Text to prepare * @param options Synthesis options * @returns Prepared text */ - private async prepareText(text: string, options?: SpeakOptions): Promise { + private async prepareText(text: string, options?: ElevenLabsTTSOptions): Promise { let processedText = text; // Convert from Speech Markdown if requested if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { - // Convert to SSML first, then strip SSML tags - // Use "elevenlabs" platform for ElevenLabs-specific Speech Markdown features const ssml = await SpeechMarkdown.toSSML(processedText, "elevenlabs"); - processedText = this._stripSSML(ssml); + processedText = ssml; } - // If text is SSML, strip the tags as ElevenLabs doesn't support SSML - // and has its own emotion analysis if (this._isSSML(processedText)) { - processedText = this._stripSSML(processedText); + const modelId = this.resolveModelId(options); + if (modelId === "eleven_v3") { + // Translate SSML to v3 audio tags where possible + processedText = this._ssmlToV3AudioTags(processedText); + } else { + // Non-v3 models: strip SSML entirely (ElevenLabs ignores it) + processedText = this._stripSSML(processedText); + } } return processedText; From cb9c8d84312aa192d13a64c0e147891437918007 Mon Sep 17 00:00:00 2001 From: Owen McGirr Date: Sat, 11 Apr 2026 08:51:13 +0100 Subject: [PATCH 2/3] refactor: extract model ID constants, fix unused capture group, improve JSDoc --- src/engines/elevenlabs.ts | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index 453f01d..a0c8c20 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -7,7 +7,9 @@ import { getFetch } from "../utils/fetch-utils"; const fetch = getFetch(); /** - * Extended options for ElevenLabs TTS + * Extended options for ElevenLabs TTS. + * seed, languageCode, previousText, nextText, and applyTextNormalization are + * only honoured by the eleven_v3 model and are silently ignored by others. */ export interface ElevenLabsTTSOptions extends SpeakOptions { format?: "mp3" | "wav"; // Define formats supported by this client logic (maps to pcm) @@ -17,11 +19,10 @@ export interface ElevenLabsTTSOptions extends SpeakOptions { outputFormat?: string; // Override output_format per request voiceSettings?: Record; // Override voice_settings per request requestOptions?: Record; // Additional request payload overrides - // v3-specific parameters - seed?: number; // Seed for deterministic output (eleven_v3+) + seed?: number; // Deterministic output — same seed produces the same audio languageCode?: string; // Force language interpretation (e.g. "en") - previousText?: string; // Context for continuity between requests - nextText?: string; // Context for continuity between requests + previousText?: string; // Context for continuity between sequential requests + nextText?: string; // Context for continuity between sequential requests applyTextNormalization?: "auto" | "on" | "off"; // Control spelling/number expansion } @@ -71,6 +72,8 @@ export interface ElevenLabsTimestampResponse { * ElevenLabs TTS client */ export class ElevenLabsTTSClient extends AbstractTTSClient { + private static readonly MODEL_V3 = "eleven_v3"; + private static readonly DEFAULT_MODEL = "eleven_multilingual_v2"; /** * ElevenLabs API key */ @@ -104,7 +107,9 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { super(credentials); this.apiKey = credentials.apiKey || process.env.ELEVENLABS_API_KEY || ""; this.modelId = - (credentials as any).modelId || (credentials as any).model || "eleven_multilingual_v2"; + (credentials as any).modelId || + (credentials as any).model || + ElevenLabsTTSClient.DEFAULT_MODEL; if (typeof (credentials as any).outputFormat === "string") { this.outputFormat = (credentials as any).outputFormat; @@ -265,7 +270,6 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { merged.output_format = this.resolveOutputFormat(options, merged); merged.voice_settings = this.resolveVoiceSettings(options, merged); - // v3 optional parameters if (options?.seed !== undefined) merged.seed = options.seed; if (options?.languageCode) merged.language_code = options.languageCode; if (options?.previousText) merged.previous_text = options.previousText; @@ -467,8 +471,8 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { // → … [excited] text = text.replace( - /]*>([\s\S]*?)<\/emphasis>/gi, - (_match, _level, content) => `${content.trim()} [excited]` + /]*>([\s\S]*?)<\/emphasis>/gi, + (_match, content) => `${content.trim()} [excited]` ); // → … [whispers] @@ -509,7 +513,7 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { if (this._isSSML(processedText)) { const modelId = this.resolveModelId(options); - if (modelId === "eleven_v3") { + if (modelId === ElevenLabsTTSClient.MODEL_V3) { // Translate SSML to v3 audio tags where possible processedText = this._ssmlToV3AudioTags(processedText); } else { From 964dc39884562fa5805ebdf4514838ba85ae6130 Mon Sep 17 00:00:00 2001 From: Owen McGirr Date: Sat, 11 Apr 2026 09:03:59 +0100 Subject: [PATCH 3/3] simplify: remove SSML-to-audio-tag translation, strip SSML for all models --- src/__tests__/elevenlabs-v3.test.ts | 91 +++-------------------------- src/engines/elevenlabs.ts | 61 +------------------ 2 files changed, 12 insertions(+), 140 deletions(-) diff --git a/src/__tests__/elevenlabs-v3.test.ts b/src/__tests__/elevenlabs-v3.test.ts index b46253a..81fe6ae 100644 --- a/src/__tests__/elevenlabs-v3.test.ts +++ b/src/__tests__/elevenlabs-v3.test.ts @@ -1,93 +1,20 @@ import { describe, it, expect } from "@jest/globals"; import { ElevenLabsTTSClient } from "../engines/elevenlabs"; -// Access private method for unit testing -function ssmlToV3AudioTags(client: ElevenLabsTTSClient, ssml: string): string { - return (client as any)._ssmlToV3AudioTags(ssml); -} - -describe("ElevenLabs v3 audio tag translation", () => { - const client = new ElevenLabsTTSClient({ apiKey: "test-key" }); - - it("translates strong emphasis to [excited]", () => { - const result = ssmlToV3AudioTags( - client, - "Hello world" - ); - expect(result).toBe("Hello world [excited]"); - }); - - it("translates moderate emphasis to [excited]", () => { - const result = ssmlToV3AudioTags( - client, - "hello" - ); - expect(result).toBe("hello [excited]"); - }); - - it("translates reduced emphasis to [whispers]", () => { - const result = ssmlToV3AudioTags( - client, - "quiet" - ); - expect(result).toBe("quiet [whispers]"); - }); - - it("translates emphasis without level to [excited]", () => { - const result = ssmlToV3AudioTags( - client, - "hey" - ); - expect(result).toBe("hey [excited]"); - }); - - it("translates to [pause]", () => { - const result = ssmlToV3AudioTags( - client, - "Helloworld" - ); - expect(result).toBe("Hello[pause]world"); - }); - - it("strips prosody tags but keeps content", () => { - const result = ssmlToV3AudioTags( - client, - "take it easy" - ); - expect(result).toBe("take it easy"); - }); - - it("strips speak wrapper", () => { - const result = ssmlToV3AudioTags(client, "plain text"); - expect(result).toBe("plain text"); +describe("ElevenLabs v3 prepareText", () => { + it("strips SSML for eleven_v3 (no translation)", async () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); + const ssml = 'Normal dramatic end'; + const prepared = await (client as any).prepareText(ssml, {}); + expect(prepared).toBe("Normal dramatic end"); }); - it("preserves existing [audio tags] in plain text through prepareText", async () => { - const v3Client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); - // Plain text with audio tags should pass through unchanged + it("preserves native [audio tags] in plain text", async () => { + const client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); const text = "Hello [excited] world [whispers]"; - const prepared = await (v3Client as any).prepareText(text, {}); + const prepared = await (client as any).prepareText(text, {}); expect(prepared).toBe(text); }); - - it("strips SSML for non-v3 models", async () => { - const v2Client = new ElevenLabsTTSClient({ - apiKey: "test-key", - modelId: "eleven_multilingual_v2", - }); - const ssml = 'hello'; - const prepared = await (v2Client as any).prepareText(ssml, {}); - // Should strip all tags, no [excited] added - expect(prepared).toBe("hello"); - expect(prepared).not.toContain("[excited]"); - }); - - it("translates SSML to audio tags for eleven_v3 model", async () => { - const v3Client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" }); - const ssml = 'Normal dramatic end'; - const prepared = await (v3Client as any).prepareText(ssml, {}); - expect(prepared).toBe("Normal dramatic [excited] end"); - }); }); describe("ElevenLabs v3 request parameters", () => { diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index a0c8c20..858de12 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -452,74 +452,19 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { } /** - * Translate SSML to ElevenLabs v3 audio tags (best-effort, lossy). - * eleven_v3 uses inline [tag] annotations instead of SSML markup. - * - * Mappings: - * → …[excited] - * → …[whispers] - * → …[excited] - * → [pause] - * → … (content kept, tags stripped) - * All other SSML tags → content kept, tags stripped - */ - private _ssmlToV3AudioTags(ssml: string): string { - let text = ssml; - - // Unwrap wrapper - text = text.replace(/<\/?speak[^>]*>/gi, ""); - - // → … [excited] - text = text.replace( - /]*>([\s\S]*?)<\/emphasis>/gi, - (_match, content) => `${content.trim()} [excited]` - ); - - // → … [whispers] - text = text.replace( - /]*>([\s\S]*?)<\/emphasis>/gi, - (_match, content) => `${content.trim()} [whispers]` - ); - - // without level → … [excited] - text = text.replace( - /]*>([\s\S]*?)<\/emphasis>/gi, - (_match, content) => `${content.trim()} [excited]` - ); - - // → [pause] - text = text.replace(//gi, "[pause]"); - - // Strip remaining SSML tags, preserving content - text = this._stripSSML(text); - - return text.trim(); - } - - /** - * Prepare text for synthesis, handling SSML and v3 audio tags. - * @param text Text to prepare - * @param options Synthesis options - * @returns Prepared text + * Prepare text for synthesis by stripping SSML tags. + * ElevenLabs does not support SSML — use native [audio tags] for v3 expressiveness. */ private async prepareText(text: string, options?: ElevenLabsTTSOptions): Promise { let processedText = text; - // Convert from Speech Markdown if requested if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { const ssml = await SpeechMarkdown.toSSML(processedText, "elevenlabs"); processedText = ssml; } if (this._isSSML(processedText)) { - const modelId = this.resolveModelId(options); - if (modelId === ElevenLabsTTSClient.MODEL_V3) { - // Translate SSML to v3 audio tags where possible - processedText = this._ssmlToV3AudioTags(processedText); - } else { - // Non-v3 models: strip SSML entirely (ElevenLabs ignores it) - processedText = this._stripSSML(processedText); - } + processedText = this._stripSSML(processedText); } return processedText;