diff --git a/src/__tests__/elevenlabs-v3.test.ts b/src/__tests__/elevenlabs-v3.test.ts
new file mode 100644
index 0000000..81fe6ae
--- /dev/null
+++ b/src/__tests__/elevenlabs-v3.test.ts
@@ -0,0 +1,60 @@
+import { describe, it, expect } from "@jest/globals";
+import { ElevenLabsTTSClient } from "../engines/elevenlabs";
+
+describe("ElevenLabs v3 prepareText", () => {
+  it("strips SSML for eleven_v3 (no translation)", async () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" });
+    const ssml = '<speak>Normal <emphasis>dramatic</emphasis> end</speak>';
+    const prepared = await (client as any).prepareText(ssml, {});
+    expect(prepared).toBe("Normal dramatic end");
+  });
+
+  it("preserves native [audio tags] in plain text", async () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key", modelId: "eleven_v3" });
+    const text = "Hello [excited] world [whispers]";
+    const prepared = await (client as any).prepareText(text, {});
+    expect(prepared).toBe(text);
+  });
+});
+
+describe("ElevenLabs v3 request parameters", () => {
+  it("includes seed in payload when set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", { seed: 42 });
+    expect(payload.seed).toBe(42);
+  });
+
+  it("includes language_code in payload when set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", { languageCode: "en" });
+    expect(payload.language_code).toBe("en");
+  });
+
+  it("includes previous_text in payload when set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", { previousText: "Before this" });
+    expect(payload.previous_text).toBe("Before this");
+  });
+
+  it("includes next_text in payload when set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", { nextText: "After this" });
+    expect(payload.next_text).toBe("After this");
+  });
+
+  it("includes apply_text_normalization in payload when set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", { applyTextNormalization: "off" });
+    expect(payload.apply_text_normalization).toBe("off");
+  });
+
+  it("omits v3 params from payload when not set", () => {
+    const client = new ElevenLabsTTSClient({ apiKey: "test-key" });
+    const payload = (client as any).buildRequestPayload("hello", {});
+    expect(payload.seed).toBeUndefined();
+    expect(payload.language_code).toBeUndefined();
+    expect(payload.previous_text).toBeUndefined();
+    expect(payload.next_text).toBeUndefined();
+    expect(payload.apply_text_normalization).toBeUndefined();
+  });
+});
diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts
index fe70528..858de12 100644
--- a/src/engines/elevenlabs.ts
+++ b/src/engines/elevenlabs.ts
@@ -7,7 +7,9 @@ import { getFetch } from "../utils/fetch-utils";
 const fetch = getFetch();
 
 /**
- * Extended options for ElevenLabs TTS
+ * Extended options for ElevenLabs TTS.
+ * seed, languageCode, previousText, nextText, and applyTextNormalization are
+ * only honoured by the eleven_v3 model and are silently ignored by others.
  */
 export interface ElevenLabsTTSOptions extends SpeakOptions {
   format?: "mp3" | "wav"; // Define formats supported by this client logic (maps to pcm)
@@ -17,6 +19,11 @@ export interface ElevenLabsTTSOptions extends SpeakOptions {
   outputFormat?: string; // Override output_format per request
   voiceSettings?: Record<string, unknown>; // Override voice_settings per request
   requestOptions?: Record<string, unknown>; // Additional request payload overrides
+  seed?: number; // Deterministic output — same seed produces the same audio
+  languageCode?: string; // Force language interpretation (e.g. "en")
+  previousText?: string; // Context for continuity between sequential requests
+  nextText?: string; // Context for continuity between sequential requests
+  applyTextNormalization?: "auto" | "on" | "off"; // Control spelling/number expansion
 }
 
 /**
@@ -65,6 +72,8 @@ export interface ElevenLabsTimestampResponse {
  * ElevenLabs TTS client
  */
 export class ElevenLabsTTSClient extends AbstractTTSClient {
+  private static readonly MODEL_V3 = "eleven_v3";
+  private static readonly DEFAULT_MODEL = "eleven_multilingual_v2";
   /**
    * ElevenLabs API key
    */
@@ -98,7 +107,9 @@ export class ElevenLabsTTSClient extends AbstractTTSClient {
     super(credentials);
     this.apiKey = credentials.apiKey || process.env.ELEVENLABS_API_KEY || "";
     this.modelId =
-      (credentials as any).modelId || (credentials as any).model || "eleven_multilingual_v2";
+      (credentials as any).modelId ||
+      (credentials as any).model ||
+      ElevenLabsTTSClient.DEFAULT_MODEL;
 
     if (typeof (credentials as any).outputFormat === "string") {
       this.outputFormat = (credentials as any).outputFormat;
@@ -259,6 +270,13 @@ export class ElevenLabsTTSClient extends AbstractTTSClient {
     merged.output_format = this.resolveOutputFormat(options, merged);
     merged.voice_settings = this.resolveVoiceSettings(options, merged);
 
+    if (options?.seed !== undefined) merged.seed = options.seed;
+    if (options?.languageCode) merged.language_code = options.languageCode;
+    if (options?.previousText) merged.previous_text = options.previousText;
+    if (options?.nextText) merged.next_text = options.nextText;
+    if (options?.applyTextNormalization)
+      merged.apply_text_normalization = options.applyTextNormalization;
+
     return merged;
   }
 
@@ -434,24 +452,17 @@
   }
 
   /**
-   * Prepare text for synthesis by stripping SSML tags
-   * @param text Text to prepare
-   * @param options Synthesis options
-   * @returns Prepared text
+   * Prepare text for synthesis by stripping SSML tags.
+   * ElevenLabs does not support SSML — use native [audio tags] for v3 expressiveness.
    */
-  private async prepareText(text: string, options?: SpeakOptions): Promise<string> {
+  private async prepareText(text: string, options?: ElevenLabsTTSOptions): Promise<string> {
     let processedText = text;
 
-    // Convert from Speech Markdown if requested
     if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
-      // Convert to SSML first, then strip SSML tags
-      // Use "elevenlabs" platform for ElevenLabs-specific Speech Markdown features
       const ssml = await SpeechMarkdown.toSSML(processedText, "elevenlabs");
-      processedText = this._stripSSML(ssml);
+      processedText = ssml;
     }
 
-    // If text is SSML, strip the tags as ElevenLabs doesn't support SSML
-    // and has its own emotion analysis
     if (this._isSSML(processedText)) {
       processedText = this._stripSSML(processedText);
     }