From 76759e473e012325299996fbc9b71319336e6025 Mon Sep 17 00:00:00 2001 From: will wade Date: Wed, 8 Apr 2026 12:47:59 +0100 Subject: [PATCH 1/7] feat: add 9 new TTS engines (Cartesia, Deepgram, Hume, xAI, Fish Audio, Mistral, Murf, Unreal Speech, Resemble) - Cartesia: sonic-3/sonic-2 with emotion-to-SSML mapping - Deepgram: aura-2 with static voice list and streaming - Hume: octave-2/octave-1 with version mapping and streaming - xAI: grok-tts with native audio tag passthrough - Fish Audio: s2-pro with model-as-header pattern - Mistral: voxtral-mini-tts-2603 with SSE streaming - Murf: GEN2/FALCON with dual model endpoints - Unreal Speech: two-step URI + direct streaming - Resemble: base64 JSON + direct streaming - ElevenLabs: added v3 audio tag processing - 166 new tests across 12 test files - All engines registered in factory, types, and exports --- BACKLOG.md | 98 ++++++ src/__tests__/cartesia-compliance.test.ts | 145 ++++++++ src/__tests__/cartesia.test.ts | 104 ++++++ src/__tests__/deepgram-compliance.test.ts | 135 ++++++++ src/__tests__/deepgram.test.ts | 92 +++++ src/__tests__/elevenlabs-audio-tags.test.ts | 94 ++++++ src/__tests__/fishaudio.test.ts | 107 ++++++ src/__tests__/hume.test.ts | 101 ++++++ src/__tests__/mistral.test.ts | 20 ++ src/__tests__/murf.test.ts | 20 ++ src/__tests__/resemble.test.ts | 17 + src/__tests__/unrealspeech.test.ts | 18 + src/__tests__/xai.test.ts | 107 ++++++ src/browser.ts | 33 +- src/engines/cartesia.ts | 356 ++++++++++++++++++++ src/engines/deepgram.ts | 269 +++++++++++++++ src/engines/elevenlabs.ts | 41 ++- src/engines/fishaudio.ts | 268 +++++++++++++++ src/engines/hume.ts | 240 +++++++++++++ src/engines/mistral.ts | 284 ++++++++++++++++ src/engines/murf.ts | 266 +++++++++++++++ src/engines/resemble.ts | 210 ++++++++++++ src/engines/unrealspeech.ts | 243 +++++++++++++ src/engines/xai.ts | 257 ++++++++++++++ src/factory-browser.ts | 60 +++- src/factory.ts | 66 +++- src/index.ts | 54 +-- src/types.ts | 11 +- 28 files changed, 
3663 insertions(+), 53 deletions(-) create mode 100644 BACKLOG.md create mode 100644 src/__tests__/cartesia-compliance.test.ts create mode 100644 src/__tests__/cartesia.test.ts create mode 100644 src/__tests__/deepgram-compliance.test.ts create mode 100644 src/__tests__/deepgram.test.ts create mode 100644 src/__tests__/elevenlabs-audio-tags.test.ts create mode 100644 src/__tests__/fishaudio.test.ts create mode 100644 src/__tests__/hume.test.ts create mode 100644 src/__tests__/mistral.test.ts create mode 100644 src/__tests__/murf.test.ts create mode 100644 src/__tests__/resemble.test.ts create mode 100644 src/__tests__/unrealspeech.test.ts create mode 100644 src/__tests__/xai.test.ts create mode 100644 src/engines/cartesia.ts create mode 100644 src/engines/deepgram.ts create mode 100644 src/engines/fishaudio.ts create mode 100644 src/engines/hume.ts create mode 100644 src/engines/mistral.ts create mode 100644 src/engines/murf.ts create mode 100644 src/engines/resemble.ts create mode 100644 src/engines/unrealspeech.ts create mode 100644 src/engines/xai.ts diff --git a/BACKLOG.md b/BACKLOG.md new file mode 100644 index 0000000..08192df --- /dev/null +++ b/BACKLOG.md @@ -0,0 +1,98 @@ +# js-tts-wrapper Engine & Feature Backlog + +Reference: [speech-sdk](https://github.com/Jellypod-Inc/speech-sdk) (`@speech-sdk/core`) + +## Completed + +- [x] Cartesia engine (`sonic-3`, `sonic-2`) with audio tag / emotion-to-SSML support +- [x] Deepgram engine (`aura-2`) with static voice list +- [x] ElevenLabs v3 audio tag passthrough (`[laugh]`, `[sigh]`, etc.) 
+- [x] Generic property pass-through via `properties` / `propertiesJson` +- [x] Hume engine (`octave-2`, `octave-1`) with streaming via separate `/tts/stream/file` endpoint +- [x] xAI engine (`grok-tts`) with native audio tag passthrough, language config +- [x] Fish Audio engine (`s2-pro`) with audio tag passthrough, model-as-header pattern +- [x] Mistral engine (`voxtral-mini-tts-2603`) with SSE streaming, base64 chunk parsing +- [x] Murf engine (`GEN2`, `FALCON`) with dual model/endpoints, base64 GEN2 / binary FALCON +- [x] Unreal Speech engine with two-step URI non-streaming, direct streaming +- [x] Resemble engine with base64 JSON non-streaming, direct streaming + +## New Engines to Add + +### Lower Priority (Open-Source / Niche) + +| Engine | Models | Key Features | Notes | +|--------|--------|-------------|-------| +| **fal** | `f5-tts`, `kokoro`, `dia-tts`, `orpheus-tts`, `index-tts-2` | Voice cloning, open-source | No streaming, many sub-models | +| **Google Gemini TTS** | `gemini-2.5-flash-preview-tts`, `gemini-2.5-pro-preview-tts` | Pseudo-streaming, 23 languages | Different from existing Google Cloud TTS | + +## Cross-Cutting Features + +### Audio Tags (Cross-Provider Abstraction) + +Unified `[tag]` syntax mapped to provider-specific representations: +- **ElevenLabs v3** — native passthrough (done) +- **Cartesia sonic-3** — emotions to `<emotion>` SSML (done) +- **OpenAI gpt-4o-mini-tts** — tags to natural language `instructions` +- **xAI grok-tts** — native passthrough (done) +- **Fish Audio s2-pro** — native passthrough (done) +- **All others** — strip tags with warnings + +### Model-Level Feature Declarations + +Add per-model capability metadata (from speech-sdk pattern): +- `streaming` — supports real-time audio streaming +- `audio-tags` — supports `[tag]` syntax +- `inline-voice-cloning` — accepts reference audio inline +- `open-source` — model is open source + +Enables runtime capability checks via `hasFeature()`. 
+ +### Unified Voice Type + +Current: engine-specific voice IDs +Proposed: `string | { url: string } | { audio: string | Uint8Array }` +- `string` — standard voice ID +- `{ url }` — voice cloning from URL +- `{ audio }` — voice cloning from inline audio + +### Voice Cloning Support + +Providers that support inline voice cloning: +- Cartesia sonic-3 +- Hume octave-2 +- Fish Audio s2-pro +- Resemble +- Mistral voxtral-mini-tts-2603 +- fal (f5-tts, dia-tts, index-tts-2) + +### Streaming Improvements + +- Cartesia: true streaming via WebSocket or SSE +- Deepgram: true streaming +- Google Gemini: pseudo-streaming (SSE base64 chunks) +- Standardize `synthToBytestream` to return actual streaming responses where supported + +### Tree-Shakeable Subpath Exports + +From speech-sdk pattern — add per-provider subpath exports in package.json: +```json +{ + "exports": { + ".": "./dist/esm/index.js", + "./cartesia": "./dist/esm/engines/cartesia.js", + "./deepgram": "./dist/esm/engines/deepgram.js" + } +} +``` + +### Unified Error Hierarchy + +Standardize errors across engines with rich context (statusCode, model, responseBody). 
+ +## Existing Engine Updates Needed + +| Engine | Update Needed | +|--------|--------------| +| **OpenAI** | Add `gpt-4o-mini-tts` model with instructions/audio tag support | +| **Google** | Add Gemini-based TTS alongside existing Cloud TTS | +| **ElevenLabs** | Close issue #24 (already fixed) | diff --git a/src/__tests__/cartesia-compliance.test.ts b/src/__tests__/cartesia-compliance.test.ts new file mode 100644 index 0000000..073daf2 --- /dev/null +++ b/src/__tests__/cartesia-compliance.test.ts @@ -0,0 +1,145 @@ +import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { CartesiaTTSClient } from "../engines/cartesia"; +import { createTTSClient } from "../factory"; + +describe("CartesiaTTSClient — Unified API compliance", () => { + let client: CartesiaTTSClient; + + beforeEach(() => { + client = new CartesiaTTSClient({ apiKey: "test-api-key" }); + }); + + describe("voiceId integration with base class", () => { + it("should set voiceId via setVoice (base class)", () => { + client.setVoice("new-voice-id"); + expect(client.getProperty("voice")).toBe("new-voice-id"); + }); + + it("should use voiceId from base class in synthToBytes", () => { + client.setVoice("test-voice-123"); + const opts = (client as any).voiceId; + expect(opts).toBe("test-voice-123"); + }); + + it("should fall back to default voice when voiceId is null", () => { + (client as any).voiceId = null; + expect((client as any).voiceId).toBeNull(); + }); + + it("should set voice via setProperty", () => { + client.setProperty("voice", "property-voice"); + expect(client.getProperty("voice")).toBe("property-voice"); + }); + }); + + describe("factory integration", () => { + it("should create client via factory", () => { + const c = createTTSClient("cartesia", { apiKey: "test" }); + expect(c).toBeDefined(); + expect(c).toBeInstanceOf(CartesiaTTSClient); + }); + + it("should apply properties via factory", () => { + const c = createTTSClient("cartesia", { + apiKey: "test", + properties: { 
model: "sonic-2" }, + }); + expect(c.getProperty("model")).toBe("sonic-2"); + }); + }); + + describe("sample rate", () => { + it("should have correct sample rate for WAV output", () => { + expect((client as any).sampleRate).toBe(44100); + }); + }); + + describe("SSML handling", () => { + it("should strip SSML before synthesis", async () => { + const result = await (client as any).prepareText( + "Hello world" + ); + expect(result).not.toContain(""); + expect(result).not.toContain(""); + expect(result).toContain("Hello"); + expect(result).toContain("world"); + }); + + it("should handle plain text without SSML", async () => { + const result = await (client as any).prepareText("Hello world"); + expect(result).toBe("Hello world"); + }); + }); + + describe("SpeechMarkdown handling", () => { + it("should convert SpeechMarkdown when option is set", async () => { + const result = await (client as any).prepareText("Hello (world)[1]", { + useSpeechMarkdown: true, + }); + expect(result).not.toContain("[1]"); + }); + }); + + describe("audio tag handling", () => { + it("should convert emotion tags to SSML for sonic-3", () => { + (client as any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [happy] world"); + expect(result).toContain(''); + }); + + it("should strip all tags for sonic-2", () => { + (client as any).model = "sonic-2"; + const result = (client as any).processAudioTags("Hello [happy] world"); + expect(result).toBe("Hello world"); + }); + + it("should pass through laughter tags for sonic-3", () => { + (client as any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [laughter] world"); + expect(result).toContain("[laughter]"); + }); + }); + + describe("credentials", () => { + it("should require apiKey credential", () => { + expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]); + }); + + it("should return false for checkCredentials without key", async () => { + const c = new CartesiaTTSClient({}); + 
expect(await c.checkCredentials()).toBe(false); + }); + + it("should provide detailed credential status", async () => { + const status = await client.getCredentialStatus(); + expect(status).toHaveProperty("valid"); + expect(status).toHaveProperty("engine"); + expect(status.engine).toBe("cartesia"); + expect(status).toHaveProperty("requiresCredentials", true); + }); + }); + + describe("word boundaries", () => { + it("should create estimated word timings in synthToBytes", () => { + (client as any)._createEstimatedWordTimings("Hello world test"); + const timings = (client as any).timings; + expect(timings.length).toBe(3); + expect(timings[0][2]).toBe("Hello"); + expect(timings[1][2]).toBe("world"); + expect(timings[2][2]).toBe("test"); + }); + }); + + describe("event system", () => { + it("should support on/connect event registration", () => { + const startFn = jest.fn(); + const endFn = jest.fn(); + client.on("start", startFn); + client.connect("onEnd", endFn); + (client as any).emit("start"); + (client as any).emit("end"); + expect(startFn).toHaveBeenCalled(); + expect(endFn).toHaveBeenCalled(); + }); + }); +}); diff --git a/src/__tests__/cartesia.test.ts b/src/__tests__/cartesia.test.ts new file mode 100644 index 0000000..c490c3b --- /dev/null +++ b/src/__tests__/cartesia.test.ts @@ -0,0 +1,104 @@ +import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { CartesiaTTSClient } from "../engines/cartesia"; + +describe("CartesiaTTSClient", () => { + let client: CartesiaTTSClient; + + beforeEach(() => { + client = new CartesiaTTSClient({ apiKey: "test-api-key" }); + }); + + it("should initialize with default values", () => { + expect(client).toBeDefined(); + expect(client.getProperty("model")).toBe("sonic-3"); + }); + + it("should initialize with custom model via credentials", () => { + const c = new CartesiaTTSClient({ apiKey: "test", model: "sonic-2" }); + expect(c.getProperty("model")).toBe("sonic-2"); + }); + + it("should initialize with 
custom model via properties", () => { + const c = new CartesiaTTSClient({ + apiKey: "test", + properties: { model: "sonic-2" }, + }); + expect(c.getProperty("model")).toBe("sonic-2"); + }); + + it("should initialize with custom model via JSON properties", () => { + const c = new CartesiaTTSClient({ + apiKey: "test", + propertiesJson: '{"model":"sonic-2"}', + }); + expect(c.getProperty("model")).toBe("sonic-2"); + }); + + it("should set and get model", () => { + client.setProperty("model", "sonic-2"); + expect(client.getProperty("model")).toBe("sonic-2"); + }); + + it("should set and get voice", () => { + client.setProperty("voice", "test-voice-id"); + expect(client.getProperty("voice")).toBe("test-voice-id"); + }); + + it("should set and get outputFormat", () => { + const fmt = { container: "mp3", bit_rate: 128000 }; + client.setProperty("outputFormat", fmt); + expect(client.getProperty("outputFormat")).toEqual(fmt); + }); + + it("should return false for checkCredentials without api key", async () => { + const c = new CartesiaTTSClient({}); + expect(await c.checkCredentials()).toBe(false); + }); + + it("should get static voices", async () => { + const voices = await client.getVoices(); + expect(voices).toBeDefined(); + expect(Array.isArray(voices)).toBe(true); + }); + + it("should process audio tags for sonic-3 model without hitting API", () => { + (client as any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [laughter] world"); + expect(result).toContain("[laughter]"); + }); + + it("should throw on synthToBytes with bad api key", async () => { + const c = new CartesiaTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytes("Hello")).rejects.toThrow(); + }); + + it("should throw on synthToBytestream with bad api key", async () => { + const c = new CartesiaTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytestream("Hello")).rejects.toThrow(); + }); + + it("should process emotion tags for sonic-3", () => { + (client as 
any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [happy] world"); + expect(result).toContain(''); + }); + + it("should pass through laughter tag for sonic-3", () => { + (client as any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [laughter] world"); + expect(result).toContain("[laughter]"); + }); + + it("should strip unsupported tags for sonic-3", () => { + (client as any).model = "sonic-3"; + const result = (client as any).processAudioTags("Hello [bizarre_tag] world"); + expect(result).not.toContain("[bizarre_tag]"); + }); + + it("should strip all tags for non-sonic-3 models", () => { + (client as any).model = "sonic-2"; + const result = (client as any).processAudioTags("Hello [happy] world"); + expect(result).not.toContain("[happy]"); + expect(result).not.toContain(" { + let client: DeepgramTTSClient; + + beforeEach(() => { + client = new DeepgramTTSClient({ apiKey: "test-api-key" }); + }); + + describe("voiceId integration with base class", () => { + it("should set voiceId via setVoice (base class)", () => { + client.setVoice("aura-2-stella-en"); + expect(client.getProperty("voice")).toBe("aura-2-stella-en"); + }); + + it("should set voice via setProperty", () => { + client.setProperty("voice", "aura-2-apollo-en"); + expect(client.getProperty("voice")).toBe("aura-2-apollo-en"); + }); + }); + + describe("factory integration", () => { + it("should create client via factory", () => { + const c = createTTSClient("deepgram", { apiKey: "test" }); + expect(c).toBeDefined(); + expect(c).toBeInstanceOf(DeepgramTTSClient); + }); + + it("should apply properties via factory", () => { + const c = createTTSClient("deepgram", { + apiKey: "test", + properties: { model: "aura" }, + }); + expect(c.getProperty("model")).toBe("aura"); + }); + }); + + describe("SSML handling", () => { + it("should strip SSML before synthesis", async () => { + const result = await (client as any).prepareText( + "Hello world" + ); + 
expect(result).not.toContain(""); + expect(result).not.toContain(" { + const result = await (client as any).prepareText("Hello world"); + expect(result).toBe("Hello world"); + }); + }); + + describe("SpeechMarkdown handling", () => { + it("should handle text without speech markdown unchanged", async () => { + const result = await (client as any).prepareText("Hello world"); + expect(result).toBe("Hello world"); + }); + }); + + describe("credentials", () => { + it("should require apiKey credential", () => { + expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]); + }); + + it("should return false for checkCredentials without key", async () => { + const c = new DeepgramTTSClient({}); + expect(await c.checkCredentials()).toBe(false); + }); + + it("should provide detailed credential status", async () => { + const status = await client.getCredentialStatus(); + expect(status).toHaveProperty("valid"); + expect(status).toHaveProperty("engine"); + expect(status.engine).toBe("deepgram"); + expect(status).toHaveProperty("requiresCredentials", true); + }); + }); + + describe("unified voices", () => { + it("should return voices with UnifiedVoice shape", async () => { + const voices = await client.getVoices(); + expect(voices.length).toBeGreaterThan(0); + for (const v of voices) { + expect(v).toHaveProperty("id"); + expect(v).toHaveProperty("name"); + expect(v).toHaveProperty("gender"); + expect(v).toHaveProperty("languageCodes"); + expect(v).toHaveProperty("provider"); + expect(v.provider).toBe("deepgram"); + expect(v.languageCodes.length).toBeGreaterThan(0); + expect(v.languageCodes[0]).toHaveProperty("bcp47"); + expect(v.languageCodes[0]).toHaveProperty("iso639_3"); + expect(v.languageCodes[0]).toHaveProperty("display"); + } + }); + + it("should filter voices by language", async () => { + const voices = await client.getVoicesByLanguage("en-US"); + expect(voices.length).toBeGreaterThan(0); + for (const v of voices) { + expect( + v.languageCodes.some((l) => 
l.bcp47 === "en-US" || l.iso639_3 === "eng") + ).toBe(true); + } + }); + }); + + describe("word boundaries", () => { + it("should create estimated word timings in synthToBytes", () => { + (client as any)._createEstimatedWordTimings("Hello world test"); + const timings = (client as any).timings; + expect(timings.length).toBe(3); + expect(timings[0][2]).toBe("Hello"); + expect(timings[1][2]).toBe("world"); + expect(timings[2][2]).toBe("test"); + }); + }); + + describe("event system", () => { + it("should support on/connect event registration", () => { + const startFn = jest.fn(); + const endFn = jest.fn(); + client.on("start", startFn); + client.connect("onEnd", endFn); + (client as any).emit("start"); + (client as any).emit("end"); + expect(startFn).toHaveBeenCalled(); + expect(endFn).toHaveBeenCalled(); + }); + }); +}); diff --git a/src/__tests__/deepgram.test.ts b/src/__tests__/deepgram.test.ts new file mode 100644 index 0000000..1788530 --- /dev/null +++ b/src/__tests__/deepgram.test.ts @@ -0,0 +1,92 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { DeepgramTTSClient } from "../engines/deepgram"; + +describe("DeepgramTTSClient", () => { + let client: DeepgramTTSClient; + + beforeEach(() => { + client = new DeepgramTTSClient({ apiKey: "test-api-key" }); + }); + + it("should initialize with default values", () => { + expect(client).toBeDefined(); + expect(client.getProperty("model")).toBe("aura-2"); + }); + + it("should initialize with custom model via credentials", () => { + const c = new DeepgramTTSClient({ apiKey: "test", model: "aura" }); + expect(c.getProperty("model")).toBe("aura"); + }); + + it("should initialize with custom model via properties", () => { + const c = new DeepgramTTSClient({ + apiKey: "test", + properties: { model: "aura" }, + }); + expect(c.getProperty("model")).toBe("aura"); + }); + + it("should initialize with custom model via JSON properties", () => { + const c = new DeepgramTTSClient({ + apiKey: "test", + 
propertiesJson: '{"model":"aura"}', + }); + expect(c.getProperty("model")).toBe("aura"); + }); + + it("should set and get model", () => { + client.setProperty("model", "aura"); + expect(client.getProperty("model")).toBe("aura"); + }); + + it("should set and get voice", () => { + client.setProperty("voice", "aura-2-stella-en"); + expect(client.getProperty("voice")).toBe("aura-2-stella-en"); + }); + + it("should return false for checkCredentials without api key", async () => { + const c = new DeepgramTTSClient({}); + expect(await c.checkCredentials()).toBe(false); + }); + + it("should get static voices", async () => { + const voices = await client.getVoices(); + expect(voices).toBeDefined(); + expect(voices.length).toBeGreaterThan(0); + expect(voices[0]).toHaveProperty("id"); + expect(voices[0]).toHaveProperty("name"); + expect(voices[0]).toHaveProperty("provider"); + }); + + it("should have correct voice provider", async () => { + const voices = await client.getVoices(); + expect(voices[0].provider).toBe("deepgram"); + }); + + it("should throw on synthToBytes with bad api key", async () => { + const c = new DeepgramTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytes("Hello")).rejects.toThrow(); + }); + + it("should throw on synthToBytestream with bad api key", async () => { + const c = new DeepgramTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytestream("Hello")).rejects.toThrow(); + }); + + it("should strip SSML before synthesis", async () => { + const c = new DeepgramTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytes("Hello world")).rejects.toThrow(); + }); + + it("should have required credentials", () => { + const c = new DeepgramTTSClient({ apiKey: "test" }); + expect((c as any).getRequiredCredentials()).toEqual(["apiKey"]); + }); + + it("should build model param from voice and model", async () => { + const c = new DeepgramTTSClient({ apiKey: "bad-key" }); + c.setProperty("voice", "aura-2-apollo-en"); + c.setProperty("model", 
"aura-2"); + await expect(c.synthToBytes("test")).rejects.toThrow("Deepgram API error"); + }); +}); diff --git a/src/__tests__/elevenlabs-audio-tags.test.ts b/src/__tests__/elevenlabs-audio-tags.test.ts new file mode 100644 index 0000000..b3dd705 --- /dev/null +++ b/src/__tests__/elevenlabs-audio-tags.test.ts @@ -0,0 +1,94 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { ElevenLabsTTSClient } from "../engines/elevenlabs"; + +describe("ElevenLabs Audio Tag Support", () => { + let client: ElevenLabsTTSClient; + + beforeEach(() => { + client = new ElevenLabsTTSClient({ apiKey: "test-api-key" }); + }); + + describe("eleven_v3 audio tag passthrough", () => { + it("should pass audio tags through for eleven_v3 model", () => { + client.setProperty("model", "eleven_v3"); + const result = (client as any).processAudioTags("Hello [laugh] world"); + expect(result).toBe("Hello [laugh] world"); + }); + + it("should pass multiple audio tags through for eleven_v3", () => { + client.setProperty("model", "eleven_v3"); + const result = (client as any).processAudioTags("[sigh] Hello [laugh] world [cheer]"); + expect(result).toBe("[sigh] Hello [laugh] world [cheer]"); + }); + + it("should pass text without tags unchanged for eleven_v3", () => { + client.setProperty("model", "eleven_v3"); + const result = (client as any).processAudioTags("Hello world"); + expect(result).toBe("Hello world"); + }); + + it("should pass per-request model override for v3 audio tags", () => { + client.setProperty("model", "eleven_multilingual_v2"); + const result = (client as any).processAudioTags("Hello [laugh] world", { model: "eleven_v3" }); + expect(result).toBe("Hello [laugh] world"); + }); + }); + + describe("non-v3 models strip audio tags", () => { + it("should strip audio tags for eleven_multilingual_v2", () => { + client.setProperty("model", "eleven_multilingual_v2"); + const result = (client as any).processAudioTags("Hello [laugh] world"); + 
expect(result).not.toContain("[laugh]"); + expect(result).toContain("Hello"); + expect(result).toContain("world"); + }); + + it("should strip audio tags for eleven_flash_v2", () => { + client.setProperty("model", "eleven_flash_v2"); + const result = (client as any).processAudioTags("Hello [sigh] world"); + expect(result).not.toContain("[sigh]"); + }); + + it("should strip audio tags for eleven_flash_v2_5", () => { + client.setProperty("model", "eleven_flash_v2_5"); + const result = (client as any).processAudioTags("[laugh] Hello"); + expect(result).not.toContain("[laugh]"); + }); + + it("should clean up whitespace after stripping", () => { + client.setProperty("model", "eleven_multilingual_v2"); + const result = (client as any).processAudioTags("Hello [laugh] world"); + expect(result).toBe("Hello world"); + }); + + it("should handle text without tags unchanged for non-v3", () => { + client.setProperty("model", "eleven_multilingual_v2"); + const result = (client as any).processAudioTags("Hello world"); + expect(result).toBe("Hello world"); + }); + }); + + describe("model configuration", () => { + it("should default to eleven_multilingual_v2", () => { + expect(client.getProperty("model")).toBe("eleven_multilingual_v2"); + }); + + it("should allow setting model to eleven_v3", () => { + client.setProperty("model", "eleven_v3"); + expect(client.getProperty("model")).toBe("eleven_v3"); + }); + + it("should accept model via constructor credentials", () => { + const c = new ElevenLabsTTSClient({ apiKey: "test", model: "eleven_v3" }); + expect(c.getProperty("model")).toBe("eleven_v3"); + }); + + it("should accept model via properties", () => { + const c = new ElevenLabsTTSClient({ + apiKey: "test", + properties: { model: "eleven_v3" }, + }); + expect(c.getProperty("model")).toBe("eleven_v3"); + }); + }); +}); diff --git a/src/__tests__/fishaudio.test.ts b/src/__tests__/fishaudio.test.ts new file mode 100644 index 0000000..f005cc9 --- /dev/null +++ 
b/src/__tests__/fishaudio.test.ts @@ -0,0 +1,107 @@ +import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { FishAudioTTSClient } from "../engines/fishaudio"; +import { createTTSClient } from "../factory"; + +describe("FishAudioTTSClient", () => { + let client: FishAudioTTSClient; + + beforeEach(() => { + client = new FishAudioTTSClient({ apiKey: "test-api-key" }); + }); + + it("should initialize with default values", () => { + expect(client).toBeDefined(); + expect(client.getProperty("model")).toBe("s2-pro"); + }); + + it("should initialize with custom model", () => { + const c = new FishAudioTTSClient({ apiKey: "test", model: "s2" }); + expect(c.getProperty("model")).toBe("s2"); + }); + + it("should initialize with properties", () => { + const c = new FishAudioTTSClient({ apiKey: "test", properties: { model: "s2" } }); + expect(c.getProperty("model")).toBe("s2"); + }); + + it("should set and get model", () => { + client.setProperty("model", "s2"); + expect(client.getProperty("model")).toBe("s2"); + }); + + it("should set and get voice via base class voiceId", () => { + client.setVoice("test-voice-ref"); + expect(client.getProperty("voice")).toBe("test-voice-ref"); + }); + + it("should pass audio tags through for s2-pro", () => { + (client as any).model = "s2-pro"; + const result = (client as any).processAudioTags("Hello [laugh] world"); + expect(result).toContain("[laugh]"); + }); + + it("should strip audio tags for non-s2-pro models", () => { + (client as any).model = "s2"; + const result = (client as any).processAudioTags("Hello [laugh] world"); + expect(result).not.toContain("[laugh]"); + expect(result).toContain("Hello world"); + }); + + it("should return false for checkCredentials without api key", async () => { + const c = new FishAudioTTSClient({}); + expect(await c.checkCredentials()).toBe(false); + }); + + it("should require apiKey credential", () => { + expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]); + }); 
+ + it("should create via factory", () => { + const c = createTTSClient("fishaudio", { apiKey: "test" }); + expect(c).toBeInstanceOf(FishAudioTTSClient); + }); + + it("should throw on synthToBytes with bad api key", async () => { + const c = new FishAudioTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytes("Hello")).rejects.toThrow(); + }); + + it("should throw on synthToBytestream with bad api key", async () => { + const c = new FishAudioTTSClient({ apiKey: "bad-key" }); + await expect(c.synthToBytestream("Hello")).rejects.toThrow(); + }); + + it("should strip SSML", async () => { + const result = await (client as any).prepareText("Hello world"); + expect(result).not.toContain(""); + expect(result).toContain("Hello world"); + }); + + it("should handle word timings", () => { + (client as any)._createEstimatedWordTimings("Hello world test"); + expect((client as any).timings.length).toBe(3); + }); + + it("should support events", () => { + const fn = jest.fn(); + client.on("start", fn); + (client as any).emit("start"); + expect(fn).toHaveBeenCalled(); + }); + + it("should provide credential status", async () => { + const status = await client.getCredentialStatus(); + expect(status.engine).toBe("fishaudio"); + expect(status.requiresCredentials).toBe(true); + }); + + it("should have correct sample rate", () => { + expect((client as any).sampleRate).toBe(44100); + }); + + it("should send model as header", async () => { + const c = new FishAudioTTSClient({ apiKey: "bad-key" }); + c.setProperty("model", "s2-pro"); + await expect(c.synthToBytes("test")).rejects.toThrow("Fish Audio API error"); + }); +}); diff --git a/src/__tests__/hume.test.ts b/src/__tests__/hume.test.ts new file mode 100644 index 0000000..2e4b5ca --- /dev/null +++ b/src/__tests__/hume.test.ts @@ -0,0 +1,101 @@ +import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { HumeTTSClient } from "../engines/hume"; +import { createTTSClient } from "../factory"; + 
/**
 * Unit tests for HumeTTSClient.
 *
 * A fresh client is built before every test. The two "bad api key" tests
 * call the real synth methods and only assert that the promise rejects.
 */
describe("HumeTTSClient", () => {
  let client: HumeTTSClient;

  beforeEach(() => {
    client = new HumeTTSClient({ apiKey: "test-api-key" });
  });

  it("should initialize with default values", () => {
    expect(client).toBeDefined();
    expect(client.getProperty("model")).toBe("octave-2");
  });

  it("should initialize with custom model", () => {
    const c = new HumeTTSClient({ apiKey: "test", model: "octave-1" });
    expect(c.getProperty("model")).toBe("octave-1");
  });

  it("should initialize with properties", () => {
    const c = new HumeTTSClient({ apiKey: "test", properties: { model: "octave-1" } });
    expect(c.getProperty("model")).toBe("octave-1");
  });

  it("should set and get model", () => {
    client.setProperty("model", "octave-1");
    expect(client.getProperty("model")).toBe("octave-1");
  });

  it("should set and get voice via base class voiceId", () => {
    client.setVoice("test-voice");
    expect(client.getProperty("voice")).toBe("test-voice");
  });

  it("should resolve version for octave-2", () => {
    expect((client as any).resolveVersion("octave-2")).toBe("2");
  });

  it("should resolve version for octave-1", () => {
    expect((client as any).resolveVersion("octave-1")).toBe("1");
  });

  it("should return undefined version for unknown model", () => {
    expect((client as any).resolveVersion("unknown")).toBeUndefined();
  });

  it("should return false for checkCredentials without api key", async () => {
    const c = new HumeTTSClient({});
    expect(await c.checkCredentials()).toBe(false);
  });

  it("should require apiKey credential", () => {
    expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]);
  });

  it("should create via factory", () => {
    const c = createTTSClient("hume", { apiKey: "test" });
    expect(c).toBeInstanceOf(HumeTTSClient);
  });

  it("should throw on synthToBytes with bad api key", async () => {
    const c = new HumeTTSClient({ apiKey: "bad-key" });
    await expect(c.synthToBytes("Hello")).rejects.toThrow();
  });

  it("should throw on synthToBytestream with bad api key", async () => {
    const c = new HumeTTSClient({ apiKey: "bad-key" });
    await expect(c.synthToBytestream("Hello")).rejects.toThrow();
  });

  it("should strip SSML", async () => {
    // The input must actually contain markup; asserting on "" can never
    // pass, because every string contains the empty string.
    const result = await (client as any).prepareText("<speak>Hello world</speak>");
    expect(result).not.toContain("<speak>");
    expect(result).toContain("Hello world");
  });

  it("should handle word timings", () => {
    (client as any)._createEstimatedWordTimings("Hello world test");
    expect((client as any).timings.length).toBe(3);
  });

  it("should support events", () => {
    const fn = jest.fn();
    client.on("start", fn);
    (client as any).emit("start");
    expect(fn).toHaveBeenCalled();
  });

  it("should provide credential status", async () => {
    const status = await client.getCredentialStatus();
    expect(status.engine).toBe("hume");
    expect(status.requiresCredentials).toBe(true);
  });

  it("should use streaming endpoint for synthToBytestream", async () => {
    // NOTE(review): this only checks that the call rejects; it does not
    // observe which endpoint was requested.
    const c = new HumeTTSClient({ apiKey: "bad-key" });
    await expect(c.synthToBytestream("Hello")).rejects.toThrow();
  });
});
/**
 * Unit tests for MurfTTSClient: defaults, property accessors, factory wiring,
 * the voice list, SSML stripping, estimated word timings and events.
 */
describe("MurfTTSClient", () => {
  let client: MurfTTSClient;

  beforeEach(() => {
    client = new MurfTTSClient({ apiKey: "test" });
  });

  it("initializes with defaults", () => {
    expect(client.getProperty("model")).toBe("GEN2");
  });

  it("sets model to FALCON", () => {
    client.setProperty("model", "FALCON");
    expect(client.getProperty("model")).toBe("FALCON");
  });

  it("sets voice via voiceId", () => {
    client.setVoice("en-US-owen");
    expect(client.getProperty("voice")).toBe("en-US-owen");
  });

  it("checks credentials without key", async () => {
    expect(await new MurfTTSClient({}).checkCredentials()).toBe(false);
  });

  it("creates via factory", () => {
    expect(createTTSClient("murf", { apiKey: "t" })).toBeInstanceOf(MurfTTSClient);
  });

  it("gets voices", async () => {
    const v = await client.getVoices();
    expect(v.length).toBeGreaterThan(0);
    expect(v[0].provider).toBe("murf");
  });

  it("filters by language", async () => {
    const v = await client.getVoicesByLanguage("en");
    expect(v.length).toBeGreaterThan(0);
  });

  it("strips SSML", async () => {
    // Feed real markup so the assertion exercises the stripping path.
    expect(await (client as any).prepareText("<speak>Hi</speak>")).toBe("Hi");
  });

  it("creates word timings", () => {
    (client as any)._createEstimatedWordTimings("a b");
    expect((client as any).timings.length).toBe(2);
  });

  it("supports events", () => {
    const fn = jest.fn();
    client.on("start", fn);
    (client as any).emit("start");
    expect(fn).toHaveBeenCalled();
  });

  it("credential status", async () => {
    const s = await client.getCredentialStatus();
    expect(s.engine).toBe("murf");
  });
});
/**
 * Unit tests for UnrealSpeechTTSClient: default voice, voice setter, factory
 * wiring, the voice list, SSML stripping, estimated word timings and events.
 */
describe("UnrealSpeechTTSClient", () => {
  let client: UnrealSpeechTTSClient;

  beforeEach(() => {
    client = new UnrealSpeechTTSClient({ apiKey: "test" });
  });

  it("initializes with defaults", () => {
    expect(client.getProperty("voice")).toBe("Sierra");
  });

  it("sets voice via voiceId", () => {
    client.setVoice("Dan");
    expect(client.getProperty("voice")).toBe("Dan");
  });

  it("checks credentials without key", async () => {
    expect(await new UnrealSpeechTTSClient({}).checkCredentials()).toBe(false);
  });

  it("creates via factory", () => {
    expect(createTTSClient("unrealspeech", { apiKey: "t" })).toBeInstanceOf(
      UnrealSpeechTTSClient
    );
  });

  it("gets voices", async () => {
    const v = await client.getVoices();
    expect(v.length).toBeGreaterThan(0);
    expect(v[0].provider).toBe("unrealspeech");
  });

  it("strips SSML", async () => {
    // Feed real markup so the assertion exercises the stripping path.
    expect(await (client as any).prepareText("<speak>Hi</speak>")).toBe("Hi");
  });

  it("creates word timings", () => {
    (client as any)._createEstimatedWordTimings("a b c");
    expect((client as any).timings.length).toBe(3);
  });

  it("supports events", () => {
    const fn = jest.fn();
    client.on("end", fn);
    (client as any).emit("end");
    expect(fn).toHaveBeenCalled();
  });

  it("has correct engine name", () => {
    expect((client as any).constructor.name).toBe("UnrealSpeechTTSClient");
  });
});
/**
 * Unit tests for XaiTTSClient.
 *
 * A fresh client is built before every test. The two "bad api key" tests
 * call the real synth methods and only assert that the promise rejects.
 */
describe("XaiTTSClient", () => {
  let client: XaiTTSClient;

  beforeEach(() => {
    client = new XaiTTSClient({ apiKey: "test-api-key" });
  });

  it("should initialize with default values", () => {
    expect(client).toBeDefined();
    expect(client.getProperty("model")).toBe("grok-tts");
    expect(client.getProperty("language")).toBe("auto");
  });

  it("should initialize with custom model", () => {
    const c = new XaiTTSClient({ apiKey: "test", model: "grok-tts" });
    expect(c.getProperty("model")).toBe("grok-tts");
  });

  it("should initialize with properties", () => {
    const c = new XaiTTSClient({ apiKey: "test", properties: { language: "en" } });
    expect(c.getProperty("language")).toBe("en");
  });

  it("should set and get model", () => {
    client.setProperty("model", "grok-tts");
    expect(client.getProperty("model")).toBe("grok-tts");
  });

  it("should set and get voice via base class voiceId", () => {
    client.setVoice("orion-56");
    expect(client.getProperty("voice")).toBe("orion-56");
  });

  it("should set and get language", () => {
    client.setProperty("language", "fr");
    expect(client.getProperty("language")).toBe("fr");
  });

  it("should return false for checkCredentials without api key", async () => {
    const c = new XaiTTSClient({});
    expect(await c.checkCredentials()).toBe(false);
  });

  it("should require apiKey credential", () => {
    expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]);
  });

  it("should get static voices", async () => {
    const voices = await client.getVoices();
    expect(voices.length).toBeGreaterThan(0);
    expect(voices[0]).toHaveProperty("id");
    expect(voices[0]).toHaveProperty("provider");
  });

  it("should filter voices by language", async () => {
    const voices = await client.getVoicesByLanguage("en");
    expect(voices.length).toBeGreaterThan(0);
  });

  it("should create via factory", () => {
    const c = createTTSClient("xai", { apiKey: "test" });
    expect(c).toBeInstanceOf(XaiTTSClient);
  });

  it("should throw on synthToBytes with bad api key", async () => {
    const c = new XaiTTSClient({ apiKey: "bad-key" });
    await expect(c.synthToBytes("Hello")).rejects.toThrow();
  });

  it("should throw on synthToBytestream with bad api key", async () => {
    const c = new XaiTTSClient({ apiKey: "bad-key" });
    await expect(c.synthToBytestream("Hello")).rejects.toThrow();
  });

  it("should pass audio tags through (native support)", () => {
    const result = (client as any).processAudioTags("Hello [laugh] world");
    expect(result).toContain("[laugh]");
  });

  it("should strip SSML", async () => {
    // The input must actually contain markup; asserting on "" can never
    // pass, because every string contains the empty string.
    const result = await (client as any).prepareText("<speak>Hello world</speak>");
    expect(result).not.toContain("<speak>");
    expect(result).toContain("Hello world");
  });

  it("should handle word timings", () => {
    (client as any)._createEstimatedWordTimings("Hello world");
    expect((client as any).timings.length).toBe(2);
  });

  it("should support events", () => {
    const fn = jest.fn();
    client.on("start", fn);
    (client as any).emit("start");
    expect(fn).toHaveBeenCalled();
  });

  it("should provide credential status", async () => {
    const status = await client.getCredentialStatus();
    expect(status.engine).toBe("xai");
    expect(status.requiresCredentials).toBe(true);
  });
});
configureSpeechMarkdown } from "./markdown/converter-browser"; - // Browser-compatible engines export { AzureTTSClient } from "./engines/azure"; +export { CartesiaTTSClient } from "./engines/cartesia"; +export { DeepgramTTSClient } from "./engines/deepgram"; export { ElevenLabsTTSClient } from "./engines/elevenlabs"; +export { EspeakBrowserTTSClient } from "./engines/espeak-wasm"; +export { FishAudioTTSClient } from "./engines/fishaudio"; export { GoogleTTSClient } from "./engines/google"; +export { HumeTTSClient } from "./engines/hume"; +export { MistralTTSClient } from "./engines/mistral"; +export { ModelsLabTTSClient } from "./engines/modelslab"; +export { MurfTTSClient } from "./engines/murf"; export { OpenAITTSClient } from "./engines/openai"; export { PlayHTTTSClient } from "./engines/playht"; export { PollyTTSClient } from "./engines/polly"; -export { WatsonTTSClient } from "./engines/watson"; -export { WitAITTSClient } from "./engines/witai"; +export { ResembleTTSClient } from "./engines/resemble"; export { SherpaOnnxWasmTTSClient } from "./engines/sherpaonnx-wasm"; -export { EspeakBrowserTTSClient } from "./engines/espeak-wasm"; +export { UnrealSpeechTTSClient } from "./engines/unrealspeech"; export { UpliftAITTSClient } from "./engines/upliftai"; -export { ModelsLabTTSClient } from "./engines/modelslab"; - +export { WatsonTTSClient } from "./engines/watson"; +export { WitAITTSClient } from "./engines/witai"; +export { XaiTTSClient } from "./engines/xai"; // Browser-compatible factory (excludes server-only engines) export { createBrowserTTSClient } from "./factory-browser"; +export * as SpeechMarkdown from "./markdown/converter-browser"; +export { configureSpeechMarkdown, SpeechMarkdownConverter } from "./markdown/converter-browser"; +export { SSMLBuilder } from "./ssml/builder"; // Mock client for testing (if available) // Note: This is conditionally exported in factory.ts instead -// Utilities -export { estimateWordBoundaries } from 
"./utils/word-timing-estimator"; -export { isBrowser, isNode } from "./utils/environment"; - // Types export * from "./types"; +export { isBrowser, isNode } from "./utils/environment"; +// Utilities +export { estimateWordBoundaries } from "./utils/word-timing-estimator"; diff --git a/src/engines/cartesia.ts b/src/engines/cartesia.ts new file mode 100644 index 0000000..c7f7a8a --- /dev/null +++ b/src/engines/cartesia.ts @@ -0,0 +1,356 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +const AUDIO_TAG_REGEX = /\[[^\]]+\]/g; + +const CARTESIA_PASSTHROUGH_TAGS = ["laughter"]; + +const CARTESIA_EMOTIONS = [ + "neutral", + "angry", + "excited", + "content", + "sad", + "scared", + "happy", + "euphoric", + "anxious", + "panicked", + "calm", + "confident", + "curious", + "frustrated", + "sarcastic", + "melancholic", + "surprised", + "disgusted", + "contemplative", + "determined", + "proud", + "distant", + "skeptical", + "mysterious", + "anticipation", + "grateful", + "affectionate", + "sympathetic", + "nostalgic", + "wistful", + "apologetic", + "hesitant", + "insecure", + "confused", + "resigned", + "alarmed", + "bored", + "tired", + "rejected", + "hurt", + "disappointed", + "dejected", + "guilty", + "envious", + "contempt", + "threatened", + "agitated", + "outraged", + "mad", + "triumphant", + "amazed", + "flirtatious", + "joking/comedic", + "serene", + "peaceful", + "enthusiastic", + "elated", + "trust", +]; + +export interface CartesiaTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + format?: "mp3" | "wav" | "ogg" | "opus" | "aac" | "flac" | "pcm"; + outputDir?: string; + outputFile?: string; + returnWordBoundaries?: boolean; + onEnd?: () => void; + providerOptions?: Record; 
/**
 * TTS client for the Cartesia HTTP API.
 *
 * Text is reduced to plain text (Speech Markdown and SSML are stripped) and
 * bracketed audio tags such as `[happy]` are then rewritten or removed by
 * {@link CartesiaTTSClient.processAudioTags} before being POSTed to
 * `/tts/bytes`.
 */
export class CartesiaTTSClient extends AbstractTTSClient {
  /** Voice used when neither the call options nor the client specify one. */
  private static readonly DEFAULT_VOICE_ID = "694f938dd2a74762ba554ff8e2a9d786";

  /** Value sent in the `Cartesia-Version` header on every request. */
  private static readonly API_VERSION = "2025-04-16";

  private apiKey: string;
  private baseUrl: string;
  private model: string;
  private outputFormat: Record<string, unknown>;

  /**
   * @param credentials API key, optional base URL / model, and optional
   *   `properties` (object or JSON string) applied through `setProperty`.
   */
  constructor(credentials: CartesiaTTSCredentials = {}) {
    super(credentials);
    // `process` does not exist in browsers; guard the lookup so constructing
    // a client without an explicit key does not throw a ReferenceError.
    const envApiKey = typeof process !== "undefined" ? process.env?.CARTESIA_API_KEY : undefined;
    this.apiKey = credentials.apiKey || envApiKey || "";
    this.baseUrl = credentials.baseURL || "https://api.cartesia.ai";
    this.model = credentials.model || "sonic-3";
    this.voiceId = CartesiaTTSClient.DEFAULT_VOICE_ID;
    this.outputFormat = {
      container: "wav",
      encoding: "pcm_f32le",
      sample_rate: 44100,
    };
    this.sampleRate = 44100;

    this.applyCredentialProperties(credentials);
  }

  /**
   * Apply `properties` / `propertiesJson` / `propertiesJSON` from the
   * credentials via `setProperty`. A JSON string that fails to parse is
   * ignored (best effort).
   */
  private applyCredentialProperties(credentials: CartesiaTTSCredentials): void {
    const rawProps =
      (credentials as any).properties ??
      (credentials as any).propertiesJson ??
      (credentials as any).propertiesJSON;

    if (!rawProps) return;

    let parsed: Record<string, unknown> | null = null;
    if (typeof rawProps === "string") {
      try {
        parsed = JSON.parse(rawProps);
      } catch {
        /* ignore: malformed JSON leaves the defaults untouched */
      }
    } else if (typeof rawProps === "object") {
      parsed = rawProps as Record<string, unknown>;
    }

    if (parsed) {
      for (const [key, value] of Object.entries(parsed)) {
        this.setProperty(key, value);
      }
    }
  }

  /**
   * Handle bracketed audio tags (`[tag]`).
   *
   * - Any model other than `sonic-3`: every tag is removed.
   * - `sonic-3`: tags listed in `CARTESIA_PASSTHROUGH_TAGS` are left in
   *   place, tags listed in `CARTESIA_EMOTIONS` become an emotion element,
   *   and every other tag is removed.
   *
   * Whitespace is collapsed and trimmed whenever at least one tag was seen.
   */
  private processAudioTags(text: string): string {
    if (this.model !== "sonic-3") {
      return text.replace(AUDIO_TAG_REGEX, "").replace(/\s+/g, " ").trim();
    }

    const tags = text.match(AUDIO_TAG_REGEX) ?? [];
    if (tags.length === 0) return text;

    let processed = text;
    for (const tag of tags) {
      const inner = tag.slice(1, -1).toLowerCase();
      if (CARTESIA_PASSTHROUGH_TAGS.includes(inner)) continue;
      if (CARTESIA_EMOTIONS.includes(inner)) {
        // NOTE(review): element syntax reconstructed from Cartesia's SSML-like
        // emotion tag; confirm against the API docs.
        processed = processed.replace(tag, `<emotion value="${inner}"/>`);
        continue;
      }
      processed = processed.replace(tag, "");
    }
    return processed.replace(/\s+/g, " ").trim();
  }

  /**
   * Convert input to the text that is sent as `transcript`: optional Speech
   * Markdown conversion, SSML stripping, then audio-tag processing.
   */
  private async prepareText(text: string, options?: SpeakOptions): Promise<string> {
    let processedText = text;

    if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
      const ssml = await SpeechMarkdown.toSSML(processedText, "w3c");
      processedText = SSMLUtils.stripSSML(ssml);
    }

    if (SSMLUtils.isSSML(processedText)) {
      processedText = SSMLUtils.stripSSML(processedText);
    }

    return this.processAudioTags(processedText);
  }

  /** Set the model id sent as `model_id`. */
  setModel(model: string): void {
    this.model = model;
  }

  /** Set the voice id sent as `voice.id`. */
  setVoice(voiceId: string): void {
    this.voiceId = voiceId;
  }

  /** Read `model`, `voice` or `outputFormat`; other names go to the base class. */
  getProperty(property: string): any {
    switch (property) {
      case "model":
        return this.model;
      case "voice":
        return this.voiceId;
      case "outputFormat":
        return this.outputFormat;
      default:
        return super.getProperty(property);
    }
  }

  /** Write `model`, `voice` or `outputFormat`; other names go to the base class. */
  setProperty(property: string, value: any): void {
    switch (property) {
      case "model":
        this.setModel(value);
        break;
      case "voice":
        this.setVoice(value);
        break;
      case "outputFormat":
        if (typeof value === "object") this.outputFormat = value as Record<string, unknown>;
        break;
      default:
        super.setProperty(property, value);
        break;
    }
  }

  /** Headers that authenticate a request and pin the API version. */
  private authHeaders(): Record<string, string> {
    return {
      "X-API-Key": this.apiKey,
      "Cartesia-Version": CartesiaTTSClient.API_VERSION,
    };
  }

  /**
   * @returns `false` when no API key is set or the `/voices` request fails;
   *   otherwise whether that request returned an OK status.
   */
  async checkCredentials(): Promise<boolean> {
    if (!this.apiKey) return false;
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: this.authHeaders(),
      });
      return response.ok;
    } catch {
      return false;
    }
  }

  protected getRequiredCredentials(): string[] {
    return ["apiKey"];
  }

  /**
   * Fetch the raw voice list. Returns `[]` on any failure.
   *
   * Accepts either a bare JSON array or an object carrying the array under
   * `data`, so `_mapVoicesToUnified` always receives an array.
   */
  protected async _getVoices(): Promise<any[]> {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: this.authHeaders(),
      });
      if (!response.ok) return [];

      const payload: unknown = await response.json();
      if (Array.isArray(payload)) return payload;
      const page = (payload as { data?: unknown } | null)?.data;
      return Array.isArray(page) ? page : [];
    } catch {
      return [];
    }
  }

  /**
   * Map raw voices to `UnifiedVoice`. Gender is guessed from the words
   * "female" / "male" in the description; a missing language falls back to
   * en-US.
   */
  protected async _mapVoicesToUnified(rawVoices: any[]): Promise<UnifiedVoice[]> {
    return rawVoices.map((voice) => ({
      id: voice.id,
      name: voice.name,
      // "female" is tested first because it also contains "male".
      gender: (voice.description?.toLowerCase().includes("female")
        ? "Female"
        : voice.description?.toLowerCase().includes("male")
          ? "Male"
          : "Unknown") as "Male" | "Female" | "Unknown",
      languageCodes: voice.language
        ? [
            {
              bcp47: voice.language,
              iso639_3: voice.language.split("-")[0],
              display: voice.language,
            },
          ]
        : [{ bcp47: "en-US", iso639_3: "eng", display: "English" }],
      provider: "cartesia" as any,
    }));
  }

  /**
   * POST the prepared text to `/tts/bytes` and return the response.
   *
   * `providerOptions` are spread before `model_id`, `transcript` and `voice`
   * so those three keys cannot be overridden by the caller.
   *
   * @throws Error prefixed "Cartesia API error" on a non-OK status.
   */
  private async requestSpeech(preparedText: string, options: CartesiaTTSOptions) {
    const voiceId = options.voice || this.voiceId || CartesiaTTSClient.DEFAULT_VOICE_ID;

    const body: Record<string, unknown> = {
      output_format: this.outputFormat,
      ...options.providerOptions,
      model_id: options.model || this.model,
      transcript: preparedText,
      voice: { mode: "id", id: voiceId },
    };

    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        ...this.authHeaders(),
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(
        `Cartesia API error: ${response.status} ${response.statusText} - ${errorText}`
      );
    }

    return response;
  }

  /**
   * Synthesize text and return the complete audio payload. Estimated word
   * timings are generated from the prepared text.
   *
   * @throws Error prefixed "Cartesia API error" on a non-OK status.
   */
  async synthToBytes(text: string, options: CartesiaTTSOptions = {}): Promise<Uint8Array> {
    const preparedText = await this.prepareText(text, options);
    const response = await this.requestSpeech(preparedText, options);

    const arrayBuffer = await response.arrayBuffer();
    this._createEstimatedWordTimings(preparedText);
    return new Uint8Array(arrayBuffer);
  }

  /**
   * Synthesize text and return the response body as a stream.
   * `wordBoundaries` is always empty. When the response exposes no body
   * stream, the payload is buffered and wrapped in a single-chunk stream.
   *
   * @throws Error prefixed "Cartesia API error" on a non-OK status.
   */
  async synthToBytestream(
    text: string,
    options: CartesiaTTSOptions = {}
  ): Promise<{
    audioStream: ReadableStream<Uint8Array>;
    wordBoundaries: Array<{ text: string; offset: number; duration: number }>;
  }> {
    const preparedText = await this.prepareText(text, options);
    const response = await this.requestSpeech(preparedText, options);

    if (!response.body) {
      const arrayBuffer = await response.arrayBuffer();
      const audioData = new Uint8Array(arrayBuffer);
      const readableStream = new ReadableStream<Uint8Array>({
        start(controller) {
          controller.enqueue(audioData);
          controller.close();
        },
      });
      return { audioStream: readableStream, wordBoundaries: [] };
    }

    return { audioStream: response.body, wordBoundaries: [] };
  }
}
/**
 * TTS client for the Deepgram speak API.
 *
 * Voices come from the static {@link DeepgramTTSClient.VOICES} list. Text is
 * reduced to plain text (Speech Markdown and SSML are stripped) before being
 * POSTed to `/speak`.
 */
export class DeepgramTTSClient extends AbstractTTSClient {
  /** Voice used when neither the call options nor the client specify one. */
  private static readonly DEFAULT_VOICE_ID = "aura-2-apollo-en";

  private apiKey: string;
  private baseUrl: string;
  private model: string;

  static readonly VOICES = [
    { id: "aura-asteria-english", name: "Asteria", gender: "Female" as const, language: "en-US" },
    { id: "aura-luna-english", name: "Luna", gender: "Female" as const, language: "en-US" },
    { id: "aura-stella-english", name: "Stella", gender: "Female" as const, language: "en-US" },
    { id: "aura-athena-english", name: "Athena", gender: "Female" as const, language: "en-US" },
    { id: "aura-hera-english", name: "Hera", gender: "Female" as const, language: "en-US" },
    { id: "aura-orion-english", name: "Orion", gender: "Male" as const, language: "en-US" },
    { id: "aura-arcas-english", name: "Arcas", gender: "Male" as const, language: "en-US" },
    { id: "aura-perseus-english", name: "Perseus", gender: "Male" as const, language: "en-US" },
    { id: "aura-angus-english", name: "Angus", gender: "Male" as const, language: "en-US" },
    { id: "aura-orpheus-english", name: "Orpheus", gender: "Male" as const, language: "en-US" },
    { id: "aura-helios-english", name: "Helios", gender: "Male" as const, language: "en-US" },
    { id: "aura-zeus-english", name: "Zeus", gender: "Male" as const, language: "en-US" },
    { id: "aura-2-andromeda-en", name: "Andromeda", gender: "Female" as const, language: "en-US" },
    {
      id: "aura-2-cassiopeia-en",
      name: "Cassiopeia",
      gender: "Female" as const,
      language: "en-US",
    },
    { id: "aura-2-dianna-en", name: "Dianna", gender: "Female" as const, language: "en-US" },
    { id: "aura-2-thalia-en", name: "Thalia", gender: "Female" as const, language: "en-US" },
    { id: "aura-2-algernon-en", name: "Algernon", gender: "Male" as const, language: "en-US" },
    {
      id: "aura-2-bellerophon-en",
      name: "Bellerophon",
      gender: "Male" as const,
      language: "en-US",
    },
    { id: "aura-2-callisto-en", name: "Callisto", gender: "Female" as const, language: "en-US" },
    { id: "aura-2-apollo-en", name: "Apollo", gender: "Male" as const, language: "en-US" },
  ];

  /**
   * @param credentials API key, optional base URL / model, and optional
   *   `properties` (object or JSON string) applied through `setProperty`.
   */
  constructor(credentials: DeepgramTTSCredentials = {}) {
    super(credentials);
    // `process` does not exist in browsers; guard the lookup so constructing
    // a client without an explicit key does not throw a ReferenceError.
    const envApiKey = typeof process !== "undefined" ? process.env?.DEEPGRAM_API_KEY : undefined;
    this.apiKey = credentials.apiKey || envApiKey || "";
    this.baseUrl = credentials.baseURL || "https://api.deepgram.com/v1";
    this.model = credentials.model || "aura-2";
    this.voiceId = DeepgramTTSClient.DEFAULT_VOICE_ID;

    this.applyCredentialProperties(credentials);
  }

  /**
   * Apply `properties` / `propertiesJson` / `propertiesJSON` from the
   * credentials via `setProperty`. A JSON string that fails to parse is
   * ignored (best effort).
   */
  private applyCredentialProperties(credentials: DeepgramTTSCredentials): void {
    const rawProps =
      (credentials as any).properties ??
      (credentials as any).propertiesJson ??
      (credentials as any).propertiesJSON;

    if (!rawProps) return;

    let parsed: Record<string, unknown> | null = null;
    if (typeof rawProps === "string") {
      try {
        parsed = JSON.parse(rawProps);
      } catch {
        /* ignore: malformed JSON leaves the defaults untouched */
      }
    } else if (typeof rawProps === "object") {
      parsed = rawProps as Record<string, unknown>;
    }

    if (parsed) {
      for (const [key, value] of Object.entries(parsed)) {
        this.setProperty(key, value);
      }
    }
  }

  /**
   * Convert input to the text that is sent as `text`: optional Speech
   * Markdown conversion followed by SSML stripping.
   */
  private async prepareText(text: string, options?: SpeakOptions): Promise<string> {
    let processedText = text;

    if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
      const ssml = await SpeechMarkdown.toSSML(processedText, "w3c");
      processedText = SSMLUtils.stripSSML(ssml);
    }

    if (SSMLUtils.isSSML(processedText)) {
      processedText = SSMLUtils.stripSSML(processedText);
    }

    return processedText;
  }

  /** Set the model family (for example `aura-2`). */
  setModel(model: string): void {
    this.model = model;
  }

  /** Set the voice id. */
  setVoice(voiceId: string): void {
    this.voiceId = voiceId;
  }

  /** Read `model` or `voice`; other names go to the base class. */
  getProperty(property: string): any {
    switch (property) {
      case "model":
        return this.model;
      case "voice":
        return this.voiceId;
      default:
        return super.getProperty(property);
    }
  }

  /** Write `model` or `voice`; other names go to the base class. */
  setProperty(property: string, value: any): void {
    switch (property) {
      case "model":
        this.setModel(value);
        break;
      case "voice":
        this.setVoice(value);
        break;
      default:
        super.setProperty(property, value);
        break;
    }
  }

  /**
   * @returns `false` when no API key is set or the request fails; otherwise
   *   whether the request returned an OK status.
   */
  async checkCredentials(): Promise<boolean> {
    if (!this.apiKey) return false;
    try {
      // NOTE(review): confirm that `/voices` exists on the Deepgram API; if it
      // does not, this check reports valid keys as invalid.
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: {
          Authorization: `Token ${this.apiKey}`,
        },
      });
      return response.ok;
    } catch {
      return false;
    }
  }

  protected getRequiredCredentials(): string[] {
    return ["apiKey"];
  }

  /** Return the static voice list; no network request is made. */
  protected async _getVoices(): Promise<any[]> {
    return DeepgramTTSClient.VOICES;
  }

  /** Map raw voices to `UnifiedVoice`, defaulting the language to en-US. */
  protected async _mapVoicesToUnified(rawVoices: any[]): Promise<UnifiedVoice[]> {
    return rawVoices.map((voice) => ({
      id: voice.id,
      name: voice.name,
      gender: voice.gender as "Male" | "Female" | "Unknown",
      languageCodes: [
        {
          bcp47: voice.language || "en-US",
          iso639_3: (voice.language || "en-US").split("-")[0],
          display: voice.language || "English (US)",
        },
      ],
      provider: "deepgram" as any,
    }));
  }

  /**
   * Build the value of the `model` query parameter.
   *
   * Every id in `VOICES`, and the default voice, already starts with the
   * model family (`aura-…`). Prefixing those with the model again produced
   * names such as `aura-2-aura-2-apollo-en`, so fully-qualified ids are used
   * verbatim and only bare voice names (e.g. `apollo-en`) get the prefix.
   */
  private resolveSpeakModel(options: DeepgramTTSOptions): string {
    const voice = options.voice || this.voiceId || DeepgramTTSClient.DEFAULT_VOICE_ID;
    const model = options.model || this.model;
    const isQualified = voice.startsWith(`${model}-`) || voice.startsWith("aura-");
    return isQualified ? voice : `${model}-${voice}`;
  }

  /**
   * POST the prepared text to `/speak` and return the response.
   *
   * `providerOptions` are spread before `text` so the caller cannot override
   * the text that is sent.
   *
   * @throws Error prefixed "Deepgram API error" on a non-OK status.
   */
  private async requestSpeech(preparedText: string, options: DeepgramTTSOptions) {
    const url = `${this.baseUrl}/speak?model=${encodeURIComponent(this.resolveSpeakModel(options))}`;

    const body: Record<string, unknown> = {
      ...options.providerOptions,
      text: preparedText,
    };

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Token ${this.apiKey}`,
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(
        `Deepgram API error: ${response.status} ${response.statusText} - ${errorText}`
      );
    }

    return response;
  }

  /**
   * Synthesize text and return the complete audio payload. Estimated word
   * timings are generated from the prepared text.
   *
   * @throws Error prefixed "Deepgram API error" on a non-OK status.
   */
  async synthToBytes(text: string, options: DeepgramTTSOptions = {}): Promise<Uint8Array> {
    const preparedText = await this.prepareText(text, options);
    const response = await this.requestSpeech(preparedText, options);

    const arrayBuffer = await response.arrayBuffer();
    this._createEstimatedWordTimings(preparedText);
    return new Uint8Array(arrayBuffer);
  }

  /**
   * Synthesize text and return the response body as a stream.
   * `wordBoundaries` is always empty. When the response exposes no body
   * stream, the payload is buffered and wrapped in a single-chunk stream.
   *
   * @throws Error prefixed "Deepgram API error" on a non-OK status.
   */
  async synthToBytestream(
    text: string,
    options: DeepgramTTSOptions = {}
  ): Promise<{
    audioStream: ReadableStream<Uint8Array>;
    wordBoundaries: Array<{ text: string; offset: number; duration: number }>;
  }> {
    const preparedText = await this.prepareText(text, options);
    const response = await this.requestSpeech(preparedText, options);

    if (!response.body) {
      const arrayBuffer = await response.arrayBuffer();
      const audioData = new Uint8Array(arrayBuffer);
      const readableStream = new ReadableStream<Uint8Array>({
        start(controller) {
          controller.enqueue(audioData);
          controller.close();
        },
      });
      return { audioStream: readableStream, wordBoundaries: [] };
    }

    return { audioStream: response.body, wordBoundaries: [] };
  }
}
prepare * @param options Synthesis options * @returns Prepared text @@ -444,21 +448,50 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { // Convert from Speech Markdown if requested if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { - // Convert to SSML first, then strip SSML tags - // Use "elevenlabs" platform for ElevenLabs-specific Speech Markdown features const ssml = await SpeechMarkdown.toSSML(processedText, "elevenlabs"); processedText = this._stripSSML(ssml); } // If text is SSML, strip the tags as ElevenLabs doesn't support SSML - // and has its own emotion analysis if (this._isSSML(processedText)) { processedText = this._stripSSML(processedText); } + // Process audio tags based on model + processedText = this.processAudioTags( + processedText, + options as ElevenLabsTTSOptions | undefined + ); + return processedText; } + /** + * Process audio tags ([laugh], [sigh], etc.) based on the model. + * eleven_v3 natively supports audio tags — pass them through. + * For all other models, strip audio tags. 
+ */ + private processAudioTags(text: string, options?: ElevenLabsTTSOptions): string { + const modelId = this.resolveModelId(options); + const isAudioTagModel = ElevenLabsTTSClient.V3_AUDIO_TAG_MODELS.some((m) => + modelId.startsWith(m) + ); + + if (isAudioTagModel) { + return text; + } + + if (!ElevenLabsTTSClient.AUDIO_TAG_REGEX.test(text)) { + return text; + } + + const stripped = text + .replace(ElevenLabsTTSClient.AUDIO_TAG_REGEX, "") + .replace(/\s+/g, " ") + .trim(); + return stripped; + } + /** * Convert text to audio bytes * @param text Text to synthesize diff --git a/src/engines/fishaudio.ts b/src/engines/fishaudio.ts new file mode 100644 index 0000000..e054727 --- /dev/null +++ b/src/engines/fishaudio.ts @@ -0,0 +1,268 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +const AUDIO_TAG_MODELS = ["s2-pro"]; + +const AUDIO_TAG_REGEX = /\[[^\]]+\]/g; + +export interface FishAudioTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + providerOptions?: Record; +} + +export interface FishAudioTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + model?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class FishAudioTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + private model: string; + + constructor(credentials: FishAudioTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.FISH_AUDIO_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.fish.audio"; + this.model = (credentials as any).model || "s2-pro"; + this.voiceId = ""; + this.sampleRate = 44100; + + this.applyCredentialProperties(credentials); + } + + 
private applyCredentialProperties(credentials: FishAudioTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? + (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private processAudioTags(text: string): string { + if (AUDIO_TAG_MODELS.includes(this.model)) { + return text; + } + + if (!AUDIO_TAG_REGEX.test(text)) return text; + + return text.replace(AUDIO_TAG_REGEX, "").replace(/\s+/g, " ").trim(); + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + processedText = this.processAudioTags(processedText); + return processedText; + } + + setModel(model: string): void { + this.model = model; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "model": + return this.model; + case "voice": + return this.voiceId; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "model": + this.setModel(value); + break; + case "voice": + this.setVoice(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await 
fetch(`${this.baseUrl}/v1/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + model: this.model, + }, + body: JSON.stringify({ text: "test" }), + }); + return response.ok; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + try { + const response = await fetch(`${this.baseUrl}/v1/model`, { + method: "GET", + headers: { + Authorization: `Bearer ${this.apiKey}`, + }, + }); + if (!response.ok) return []; + const data = await response.json(); + return Array.isArray(data) ? data : []; + } catch { + return []; + } + } + + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices + .filter((v: any) => v.type === "tts" || v.task === "tts") + .map((voice: any) => ({ + id: voice._id || voice.id, + name: voice.title || voice.name || "Unknown", + gender: (voice.gender || "Unknown") as "Male" | "Female" | "Unknown", + languageCodes: voice.languages + ? 
voice.languages.map((lang: string) => ({ + bcp47: lang, + iso639_3: lang.split("-")[0], + display: lang, + })) + : [{ bcp47: "en", iso639_3: "eng", display: "English" }], + provider: "fishaudio" as any, + })); + } + + async synthToBytes(text: string, options: FishAudioTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + text: preparedText, + }; + if (voiceId) { + body.reference_id = voiceId; + } + + const response = await fetch(`${this.baseUrl}/v1/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + model: modelId, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Fish Audio API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + const arrayBuffer = await response.arrayBuffer(); + this._createEstimatedWordTimings(preparedText); + return new Uint8Array(arrayBuffer); + } + + async synthToBytestream( + text: string, + options: FishAudioTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + text: preparedText, + }; + if (voiceId) { + body.reference_id = voiceId; + } + + const response = await fetch(`${this.baseUrl}/v1/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + model: modelId, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Fish Audio API error: 
${response.status} ${response.statusText} - ${errorText}` + ); + } + + if (!response.body) { + const arrayBuffer = await response.arrayBuffer(); + const audioData = new Uint8Array(arrayBuffer); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(audioData); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + return { audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/engines/hume.ts b/src/engines/hume.ts new file mode 100644 index 0000000..bc737da --- /dev/null +++ b/src/engines/hume.ts @@ -0,0 +1,240 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +export interface HumeTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + providerOptions?: Record; +} + +export interface HumeTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + model?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class HumeTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + private model: string; + + constructor(credentials: HumeTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.HUME_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.hume.ai/v0"; + this.model = (credentials as any).model || "octave-2"; + this.voiceId = "aac4caff-e2e1-4088-9d58-a29c5d22dce6"; + this.sampleRate = 24000; + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: HumeTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? 
+ (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private resolveVersion(modelId: string): string | undefined { + if (modelId === "octave-2") return "2"; + if (modelId === "octave-1") return "1"; + return undefined; + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + return processedText; + } + + setModel(model: string): void { + this.model = model; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "model": + return this.model; + case "voice": + return this.voiceId; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "model": + this.setModel(value); + break; + case "voice": + this.setVoice(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await fetch(`${this.baseUrl}/tts/file`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Hume-Api-Key": this.apiKey, + }, + body: JSON.stringify({ utterances: [{ text: "test" }] }), + }); + return response.ok; + } catch { + return false; + } + } + + protected getRequiredCredentials(): 
string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return []; + } + + protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { + return []; + } + + async synthToBytes(text: string, options: HumeTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const utterance: Record = { text: preparedText }; + if (voiceId) { + utterance.voice = { name: voiceId, provider: "HUME_AI" }; + } + + const body: Record = { + ...options.providerOptions, + utterances: [utterance], + }; + + const version = this.resolveVersion(modelId); + if (version != null) { + body.version = version; + } + + const response = await fetch(`${this.baseUrl}/tts/file`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Hume-Api-Key": this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Hume API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + const arrayBuffer = await response.arrayBuffer(); + this._createEstimatedWordTimings(preparedText); + return new Uint8Array(arrayBuffer); + } + + async synthToBytestream( + text: string, + options: HumeTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const utterance: Record = { text: preparedText }; + if (voiceId) { + utterance.voice = { name: voiceId, provider: "HUME_AI" }; + } + + const body: Record = { + ...options.providerOptions, + utterances: [utterance], + }; + + const version = this.resolveVersion(modelId); + if (version != null) { + body.version = version; + } + + const response = await 
fetch(`${this.baseUrl}/tts/stream/file`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-Hume-Api-Key": this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Hume API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + if (!response.body) { + const arrayBuffer = await response.arrayBuffer(); + const audioData = new Uint8Array(arrayBuffer); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(audioData); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + return { audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/engines/mistral.ts b/src/engines/mistral.ts new file mode 100644 index 0000000..251de4c --- /dev/null +++ b/src/engines/mistral.ts @@ -0,0 +1,284 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +export interface MistralTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + responseFormat?: "mp3" | "wav" | "opus"; + providerOptions?: Record; +} + +export interface MistralTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + model?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class MistralTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + private model: string; + private responseFormat: string; + + constructor(credentials: MistralTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.MISTRAL_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.mistral.ai/v1"; + 
this.model = (credentials as any).model || "voxtral-mini-tts-2603"; + this.voiceId = ""; + this.responseFormat = "mp3"; + this.sampleRate = 24000; + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: MistralTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? + (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + return processedText; + } + + setModel(model: string): void { + this.model = model; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "model": + return this.model; + case "voice": + return this.voiceId; + case "responseFormat": + return this.responseFormat; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "model": + this.setModel(value); + break; + case "voice": + this.setVoice(value); + break; + case "responseFormat": + this.responseFormat = value; + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const 
response = await fetch(`${this.baseUrl}/models`, { + method: "GET", + headers: { + Authorization: `Bearer ${this.apiKey}`, + }, + }); + return response.ok; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return []; + } + + protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { + return []; + } + + async synthToBytes(text: string, options: MistralTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const body: Record = { + response_format: options.responseFormat || this.responseFormat, + ...options.providerOptions, + model: modelId, + input: preparedText, + }; + if (voiceId) { + body.voice_id = voiceId; + } + + const response = await fetch(`${this.baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Mistral API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + const json = (await response.json()) as { audio_data: string }; + const binaryStr = atob(json.audio_data); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + + this._createEstimatedWordTimings(preparedText); + return bytes; + } + + async synthToBytestream( + text: string, + options: MistralTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const body: Record = { + 
response_format: options.responseFormat || this.responseFormat, + ...options.providerOptions, + model: modelId, + input: preparedText, + stream: true, + }; + if (voiceId) { + body.voice_id = voiceId; + } + + const response = await fetch(`${this.baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + Accept: "text/event-stream", + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Mistral API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + if (!response.body) { + const bytes = await this.synthToBytes(text, options); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(bytes); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + const sseStream = this.parseSseBase64Stream(response.body); + return { audioStream: sseStream, wordBoundaries: [] }; + } + + private parseSseBase64Stream(body: ReadableStream): ReadableStream { + const reader = body.getReader(); + const decoder = new TextDecoder(); + let buffer = ""; + + return new ReadableStream({ + async pull(controller) { + while (true) { + const { done, value } = await reader.read(); + if (done) { + controller.close(); + return; + } + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + + for (const line of lines) { + if (!line.startsWith("data: ")) continue; + const data = line.slice(6).trim(); + if (!data || data === "[DONE]") continue; + + try { + const json = JSON.parse(data); + if (json.type === "speech.audio.delta" && typeof json.audio_data === "string") { + const binaryStr = atob(json.audio_data); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + controller.enqueue(bytes); + } + } catch { 
+ /* skip malformed */ + } + } + } + }, + }); + } +} diff --git a/src/engines/murf.ts b/src/engines/murf.ts new file mode 100644 index 0000000..8da77a5 --- /dev/null +++ b/src/engines/murf.ts @@ -0,0 +1,266 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +export interface MurfTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + providerOptions?: Record; +} + +export interface MurfTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + model?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class MurfTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + private model: string; + + static readonly VOICES = [ + { id: "en-US-natalie", name: "Natalie", gender: "Female" as const, language: "en-US" }, + { id: "en-US-owen", name: "Owen", gender: "Male" as const, language: "en-US" }, + { id: "en-US-amira", name: "Amira", gender: "Female" as const, language: "en-US" }, + { id: "en-US-daniel", name: "Daniel", gender: "Male" as const, language: "en-US" }, + { id: "en-US-taylor", name: "Taylor", gender: "Female" as const, language: "en-US" }, + { id: "en-US-alex", name: "Alex", gender: "Male" as const, language: "en-US" }, + { id: "en-US-emily", name: "Emily", gender: "Female" as const, language: "en-US" }, + { id: "en-US-ben", name: "Ben", gender: "Male" as const, language: "en-US" }, + { id: "en-US-claire", name: "Claire", gender: "Female" as const, language: "en-US" }, + { id: "en-US-glen", name: "Glen", gender: "Male" as const, language: "en-US" }, + { id: "de-DE-detlef", name: "Detlef", gender: "Male" as const, language: "de-DE" }, + { id: "es-ES-rosalyn", name: "Rosalyn", gender: 
"Female" as const, language: "es-ES" }, + { id: "fr-FR-henri", name: "Henri", gender: "Male" as const, language: "fr-FR" }, + { id: "pt-BR-thomas", name: "Thomas", gender: "Male" as const, language: "pt-BR" }, + { id: "it-IT-giulia", name: "Giulia", gender: "Female" as const, language: "it-IT" }, + ]; + + constructor(credentials: MurfTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.MURF_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.murf.ai/v1"; + this.model = (credentials as any).model || "GEN2"; + this.voiceId = "en-US-natalie"; + this.sampleRate = 24000; + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: MurfTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? + (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + return processedText; + } + + setModel(model: string): void { + this.model = model; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "model": + return this.model; + case "voice": + return this.voiceId; + default: + return 
super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "model": + this.setModel(value); + break; + case "voice": + this.setVoice(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await fetch(`${this.baseUrl}/speech/generate`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "api-key": this.apiKey, + }, + body: JSON.stringify({ voiceId: "test", text: "test", encodeAsBase64: true }), + }); + return response.status !== 401; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return MurfTTSClient.VOICES; + } + + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice) => ({ + id: voice.id, + name: voice.name, + gender: voice.gender as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en-US", + iso639_3: (voice.language || "en-US").split("-")[0], + display: voice.language || "English (US)", + }, + ], + provider: "murf" as any, + })); + } + + async synthToBytes(text: string, options: MurfTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + const isFalcon = modelId === "FALCON"; + + const body: Record = { + ...options.providerOptions, + voiceId, + text: preparedText, + }; + + if (isFalcon) { + body.model = "FALCON"; + } else { + body.encodeAsBase64 = true; + } + + const url = isFalcon ? 
`${this.baseUrl}/speech/stream` : `${this.baseUrl}/speech/generate`; + + const response = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + "api-key": this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Murf API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + if (isFalcon) { + const arrayBuffer = await response.arrayBuffer(); + this._createEstimatedWordTimings(preparedText); + return new Uint8Array(arrayBuffer); + } + + const json = (await response.json()) as { encodedAudio: string }; + const binaryStr = atob(json.encodedAudio); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + this._createEstimatedWordTimings(preparedText); + return bytes; + } + + async synthToBytestream( + text: string, + options: MurfTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const modelId = options.model || this.model; + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + voiceId, + text: preparedText, + model: modelId, + }; + + const response = await fetch(`${this.baseUrl}/speech/stream`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "api-key": this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Murf API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + if (!response.body) { + const bytes = await this.synthToBytes(text, options); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(bytes); + controller.close(); + }, + }); + return { audioStream: readableStream, 
wordBoundaries: [] }; + } + + return { audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/engines/resemble.ts b/src/engines/resemble.ts new file mode 100644 index 0000000..253f43f --- /dev/null +++ b/src/engines/resemble.ts @@ -0,0 +1,210 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +export interface ResembleTTSOptions extends SpeakOptions { + voice?: string; + providerOptions?: Record; +} + +export interface ResembleTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class ResembleTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + + constructor(credentials: ResembleTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.RESEMBLE_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://f.cluster.resemble.ai"; + this.voiceId = ""; + this.sampleRate = 22050; + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: ResembleTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? 
+ (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + return processedText; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "voice": + return this.voiceId; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "voice": + this.setVoice(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await fetch(`${this.baseUrl}/synthesize`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: this.apiKey, + }, + body: JSON.stringify({ voice_uuid: "test", data: "test" }), + }); + return response.status !== 401; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return []; + } + + protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { + return []; + } + + async synthToBytes(text: string, options: ResembleTTSOptions = {}): Promise { + const preparedText = await 
this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + voice_uuid: voiceId, + data: preparedText, + }; + + const response = await fetch(`${this.baseUrl}/synthesize`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Resemble API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + const json = (await response.json()) as { audio_content: string }; + const binaryStr = atob(json.audio_content); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + + this._createEstimatedWordTimings(preparedText); + return bytes; + } + + async synthToBytestream( + text: string, + options: ResembleTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + voice_uuid: voiceId, + data: preparedText, + }; + + const response = await fetch(`${this.baseUrl}/stream`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: this.apiKey, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Resemble API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + if (!response.body) { + const bytes = await this.synthToBytes(text, options); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(bytes); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + return { 
audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/engines/unrealspeech.ts b/src/engines/unrealspeech.ts new file mode 100644 index 0000000..30c4b77 --- /dev/null +++ b/src/engines/unrealspeech.ts @@ -0,0 +1,243 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +export interface UnrealSpeechTTSOptions extends SpeakOptions { + voice?: string; + audioFormat?: "mp3" | "wav" | "pcm"; + providerOptions?: Record; +} + +export interface UnrealSpeechTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class UnrealSpeechTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + + static readonly VOICES = [ + { id: "Sierra", name: "Sierra", gender: "Female" as const, language: "en-US" }, + { id: "Dan", name: "Dan", gender: "Male" as const, language: "en-US" }, + { id: "Will", name: "Will", gender: "Male" as const, language: "en-US" }, + { id: "Scarlett", name: "Scarlett", gender: "Female" as const, language: "en-US" }, + { id: "Liv", name: "Liv", gender: "Female" as const, language: "en-US" }, + { id: "Amy", name: "Amy", gender: "Female" as const, language: "en-US" }, + { id: "Eric", name: "Eric", gender: "Male" as const, language: "en-US" }, + { id: "Brian", name: "Brian", gender: "Male" as const, language: "en-US" }, + ]; + + constructor(credentials: UnrealSpeechTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.UNREAL_SPEECH_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.v8.unrealspeech.com"; + this.voiceId = "Sierra"; + this.sampleRate = 24000; + + 
this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: UnrealSpeechTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? + (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + return processedText; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "voice": + return this.voiceId; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "voice": + this.setVoice(value); + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await fetch(`${this.baseUrl}/speech`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify({ + Text: "test", + VoiceId: "Sierra", + AudioFormat: "mp3", + OutputFormat: "uri", + }), + }); + return response.status !== 401; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + 
return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return UnrealSpeechTTSClient.VOICES; + } + + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice) => ({ + id: voice.id, + name: voice.name, + gender: voice.gender as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en-US", + iso639_3: (voice.language || "en-US").split("-")[0], + display: voice.language || "English (US)", + }, + ], + provider: "unrealspeech" as any, + })); + } + + async synthToBytes(text: string, options: UnrealSpeechTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + const body: Record = { + ...options.providerOptions, + AudioFormat: options.audioFormat || "mp3", + OutputFormat: "uri", + VoiceId: voiceId, + Text: preparedText, + }; + + const response = await fetch(`${this.baseUrl}/speech`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Unreal Speech API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + const json = (await response.json()) as { OutputUri: string }; + const audioResponse = await fetch(json.OutputUri); + + if (!audioResponse.ok) { + throw new Error(`Unreal Speech download error: ${audioResponse.status}`); + } + + const arrayBuffer = await audioResponse.arrayBuffer(); + this._createEstimatedWordTimings(preparedText); + return new Uint8Array(arrayBuffer); + } + + async synthToBytestream( + text: string, + options: UnrealSpeechTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + 
const body: Record = { + ...options.providerOptions, + AudioFormat: options.audioFormat || "mp3", + VoiceId: voiceId, + Text: preparedText, + }; + + const response = await fetch(`${this.baseUrl}/stream`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Unreal Speech API error: ${response.status} ${response.statusText} - ${errorText}` + ); + } + + if (!response.body) { + const bytes = await this.synthToBytes(text, options); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(bytes); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + return { audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/engines/xai.ts b/src/engines/xai.ts new file mode 100644 index 0000000..c469e9a --- /dev/null +++ b/src/engines/xai.ts @@ -0,0 +1,257 @@ +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { getFetch } from "../utils/fetch-utils"; + +const fetch = getFetch(); + +const AUDIO_TAG_REGEX = /\[[^\]]+\]/g; + +export interface XaiTTSOptions extends SpeakOptions { + model?: string; + voice?: string; + language?: string; + providerOptions?: Record; +} + +export interface XaiTTSCredentials extends TTSCredentials { + apiKey?: string; + baseURL?: string; + model?: string; + properties?: Record | string; + propertiesJson?: string; +} + +export class XaiTTSClient extends AbstractTTSClient { + private apiKey: string; + private baseUrl: string; + private model: string; + private language: string; + + static readonly VOICES = [ + { id: "avalon-47", name: "Avalon", gender: "Female" as const, 
language: "en" }, + { id: "orion-56", name: "Orion", gender: "Male" as const, language: "en" }, + { id: "luna-30", name: "Luna", gender: "Female" as const, language: "en" }, + { id: "atlas-84", name: "Atlas", gender: "Male" as const, language: "en" }, + { id: "aria-42", name: "Aria", gender: "Female" as const, language: "en" }, + { id: "cosmo-01", name: "Cosmo", gender: "Male" as const, language: "en" }, + ]; + + constructor(credentials: XaiTTSCredentials = {}) { + super(credentials); + this.apiKey = credentials.apiKey || process.env.XAI_API_KEY || ""; + this.baseUrl = credentials.baseURL || "https://api.x.ai/v1"; + this.model = (credentials as any).model || "grok-tts"; + this.voiceId = "avalon-47"; + this.language = "auto"; + this.sampleRate = 24000; + + this.applyCredentialProperties(credentials); + } + + private applyCredentialProperties(credentials: XaiTTSCredentials): void { + const rawProps = + (credentials as any).properties ?? + (credentials as any).propertiesJson ?? + (credentials as any).propertiesJSON; + + if (rawProps) { + let parsed: Record | null = null; + if (typeof rawProps === "string") { + try { + parsed = JSON.parse(rawProps); + } catch { + /* ignore */ + } + } else if (typeof rawProps === "object") { + parsed = rawProps as Record; + } + if (parsed) { + for (const [key, value] of Object.entries(parsed)) { + this.setProperty(key, value); + } + } + } + } + + private processAudioTags(text: string): string { + if (!AUDIO_TAG_REGEX.test(text)) return text; + return text; + } + + private async prepareText(text: string, options?: SpeakOptions): Promise { + let processedText = text; + + if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { + const ssml = await SpeechMarkdown.toSSML(processedText, "w3c"); + processedText = SSMLUtils.stripSSML(ssml); + } + + if (SSMLUtils.isSSML(processedText)) { + processedText = SSMLUtils.stripSSML(processedText); + } + + processedText = this.processAudioTags(processedText); + return 
processedText; + } + + setModel(model: string): void { + this.model = model; + } + + setVoice(voiceId: string): void { + this.voiceId = voiceId; + } + + getProperty(property: string): any { + switch (property) { + case "model": + return this.model; + case "voice": + return this.voiceId; + case "language": + return this.language; + default: + return super.getProperty(property); + } + } + + setProperty(property: string, value: any): void { + switch (property) { + case "model": + this.setModel(value); + break; + case "voice": + this.setVoice(value); + break; + case "language": + this.language = value; + break; + default: + super.setProperty(property, value); + break; + } + } + + async checkCredentials(): Promise { + if (!this.apiKey) return false; + try { + const response = await fetch(`${this.baseUrl}/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify({ text: "test", language: "auto" }), + }); + return response.ok; + } catch { + return false; + } + } + + protected getRequiredCredentials(): string[] { + return ["apiKey"]; + } + + protected async _getVoices(): Promise { + return XaiTTSClient.VOICES; + } + + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice) => ({ + id: voice.id, + name: voice.name, + gender: voice.gender as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en", + iso639_3: (voice.language || "en").split("-")[0], + display: voice.language || "English", + }, + ], + provider: "xai" as any, + })); + } + + async synthToBytes(text: string, options: XaiTTSOptions = {}): Promise { + const preparedText = await this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + const body: Record = { + language: options.language || this.language, + ...options.providerOptions, + text: preparedText, + }; + if (voiceId) { + body.voice_id = voiceId; + } + + const response = 
await fetch(`${this.baseUrl}/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`xAI API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + const arrayBuffer = await response.arrayBuffer(); + this._createEstimatedWordTimings(preparedText); + return new Uint8Array(arrayBuffer); + } + + async synthToBytestream( + text: string, + options: XaiTTSOptions = {} + ): Promise<{ + audioStream: ReadableStream; + wordBoundaries: Array<{ text: string; offset: number; duration: number }>; + }> { + const preparedText = await this.prepareText(text, options); + const voiceId = options.voice || this.voiceId; + + const body: Record = { + language: options.language || this.language, + ...options.providerOptions, + text: preparedText, + }; + if (voiceId) { + body.voice_id = voiceId; + } + + const response = await fetch(`${this.baseUrl}/tts`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`xAI API error: ${response.status} ${response.statusText} - ${errorText}`); + } + + if (!response.body) { + const arrayBuffer = await response.arrayBuffer(); + const audioData = new Uint8Array(arrayBuffer); + const readableStream = new ReadableStream({ + start(controller) { + controller.enqueue(audioData); + controller.close(); + }, + }); + return { audioStream: readableStream, wordBoundaries: [] }; + } + + return { audioStream: response.body, wordBoundaries: [] }; + } +} diff --git a/src/factory-browser.ts b/src/factory-browser.ts index 80749c1..1c17a2e 100644 --- a/src/factory-browser.ts +++ b/src/factory-browser.ts @@ -1,16 +1,25 @@ // Browser-compatible factory for TTS clients import { 
AzureTTSClient } from "./engines/azure.js"; +import { CartesiaTTSClient } from "./engines/cartesia.js"; +import { DeepgramTTSClient } from "./engines/deepgram.js"; import { ElevenLabsTTSClient } from "./engines/elevenlabs.js"; import { EspeakBrowserTTSClient } from "./engines/espeak-wasm.js"; +import { FishAudioTTSClient } from "./engines/fishaudio.js"; import { GoogleTTSClient } from "./engines/google.js"; +import { HumeTTSClient } from "./engines/hume.js"; +import { MistralTTSClient } from "./engines/mistral.js"; +import { ModelsLabTTSClient } from "./engines/modelslab.js"; +import { MurfTTSClient } from "./engines/murf.js"; import { OpenAITTSClient } from "./engines/openai.js"; import { PlayHTTTSClient } from "./engines/playht.js"; import { PollyTTSClient } from "./engines/polly.js"; +import { ResembleTTSClient } from "./engines/resemble.js"; import { SherpaOnnxWasmTTSClient } from "./engines/sherpaonnx-wasm.js"; +import { UnrealSpeechTTSClient } from "./engines/unrealspeech.js"; +import { UpliftAITTSClient } from "./engines/upliftai.js"; import { WatsonTTSClient } from "./engines/watson.js"; import { WitAITTSClient } from "./engines/witai.js"; -import { UpliftAITTSClient } from "./engines/upliftai.js"; -import { ModelsLabTTSClient } from "./engines/modelslab.js"; +import { XaiTTSClient } from "./engines/xai.js"; import type { TTSCredentials } from "./types"; // Import MockTTSClient for testing @@ -30,13 +39,22 @@ try { export type SupportedBrowserTTS = | "azure" + | "cartesia" + | "deepgram" + | "fishaudio" | "google" + | "hume" + | "mistral" + | "murf" | "polly" | "elevenlabs" | "openai" | "playht" | "watson" | "witai" + | "xai" + | "resemble" + | "unrealspeech" | "upliftai" | "modelslab" | "sherpaonnx-wasm" @@ -83,6 +101,18 @@ export function createBrowserTTSClient(engine: SupportedBrowserTTS, credentials? 
return applyProperties( new AzureTTSClient(credentials as { subscriptionKey: string; region: string }) ); + case "cartesia": + return applyProperties( + new CartesiaTTSClient(credentials as import("./engines/cartesia").CartesiaTTSCredentials) + ); + case "deepgram": + return applyProperties( + new DeepgramTTSClient(credentials as import("./engines/deepgram").DeepgramTTSCredentials) + ); + case "fishaudio": + return applyProperties( + new FishAudioTTSClient(credentials as import("./engines/fishaudio").FishAudioTTSCredentials) + ); case "google": return applyProperties( new GoogleTTSClient(credentials as import("./engines/google").GoogleTTSCredentials) @@ -111,14 +141,40 @@ export function createBrowserTTSClient(engine: SupportedBrowserTTS, credentials? return applyProperties( new WitAITTSClient(credentials as import("./engines/witai").WitAITTSCredentials) ); + case "xai": + return applyProperties( + new XaiTTSClient(credentials as import("./engines/xai").XaiTTSCredentials) + ); case "upliftai": return applyProperties( new UpliftAITTSClient(credentials as import("./engines/upliftai").UpliftAITTSCredentials) ); + case "hume": + return applyProperties( + new HumeTTSClient(credentials as import("./engines/hume").HumeTTSCredentials) + ); + case "mistral": + return applyProperties( + new MistralTTSClient(credentials as import("./engines/mistral").MistralTTSCredentials) + ); + case "murf": + return applyProperties( + new MurfTTSClient(credentials as import("./engines/murf").MurfTTSCredentials) + ); case "modelslab": return applyProperties( new ModelsLabTTSClient(credentials as import("./engines/modelslab").ModelsLabTTSCredentials) ); + case "resemble": + return applyProperties( + new ResembleTTSClient(credentials as import("./engines/resemble").ResembleTTSCredentials) + ); + case "unrealspeech": + return applyProperties( + new UnrealSpeechTTSClient( + credentials as import("./engines/unrealspeech").UnrealSpeechTTSCredentials + ) + ); case "sherpaonnx-wasm": return 
applyProperties(new SherpaOnnxWasmTTSClient(credentials as any)); case "espeak-wasm": diff --git a/src/factory.ts b/src/factory.ts index 44318fb..e8f7606 100644 --- a/src/factory.ts +++ b/src/factory.ts @@ -1,19 +1,28 @@ // Factory for TTS clients (browser/server compatible) import { AzureTTSClient } from "./engines/azure.js"; +import { CartesiaTTSClient } from "./engines/cartesia.js"; +import { DeepgramTTSClient } from "./engines/deepgram.js"; import { ElevenLabsTTSClient } from "./engines/elevenlabs.js"; -import { EspeakWasmTTSClient } from "./engines/espeak-wasm.js"; import { EspeakTTSClient } from "./engines/espeak.js"; +import { EspeakWasmTTSClient } from "./engines/espeak-wasm.js"; +import { FishAudioTTSClient } from "./engines/fishaudio.js"; import { GoogleTTSClient } from "./engines/google.js"; +import { HumeTTSClient } from "./engines/hume.js"; +import { MistralTTSClient } from "./engines/mistral.js"; +import { ModelsLabTTSClient } from "./engines/modelslab.js"; +import { MurfTTSClient } from "./engines/murf.js"; import { OpenAITTSClient } from "./engines/openai.js"; import { PlayHTTTSClient } from "./engines/playht.js"; import { PollyTTSClient } from "./engines/polly.js"; -import { UpliftAITTSClient } from "./engines/upliftai.js"; -import { ModelsLabTTSClient } from "./engines/modelslab.js"; -import { SherpaOnnxWasmTTSClient } from "./engines/sherpaonnx-wasm.js"; +import { ResembleTTSClient } from "./engines/resemble.js"; +import { SAPITTSClient } from "./engines/sapi.js"; import { SherpaOnnxTTSClient } from "./engines/sherpaonnx.js"; +import { SherpaOnnxWasmTTSClient } from "./engines/sherpaonnx-wasm.js"; +import { UnrealSpeechTTSClient } from "./engines/unrealspeech.js"; +import { UpliftAITTSClient } from "./engines/upliftai.js"; import { WatsonTTSClient } from "./engines/watson.js"; import { WitAITTSClient } from "./engines/witai.js"; -import { SAPITTSClient } from "./engines/sapi.js"; +import { XaiTTSClient } from "./engines/xai.js"; import type { 
TTSCredentials } from "./types"; // Import MockTTSClient for testing @@ -33,13 +42,22 @@ try { export type SupportedTTS = | "azure" + | "cartesia" + | "deepgram" + | "fishaudio" | "google" + | "hume" + | "mistral" + | "murf" | "polly" | "elevenlabs" | "openai" | "playht" | "watson" | "witai" + | "xai" + | "resemble" + | "unrealspeech" | "upliftai" | "modelslab" | "sherpaonnx" @@ -89,6 +107,18 @@ export function createTTSClient(engine: SupportedTTS, credentials?: TTSCredentia return applyProperties( new AzureTTSClient(credentials as { subscriptionKey: string; region: string }) ); + case "cartesia": + return applyProperties( + new CartesiaTTSClient(credentials as import("./engines/cartesia").CartesiaTTSCredentials) + ); + case "deepgram": + return applyProperties( + new DeepgramTTSClient(credentials as import("./engines/deepgram").DeepgramTTSCredentials) + ); + case "fishaudio": + return applyProperties( + new FishAudioTTSClient(credentials as import("./engines/fishaudio").FishAudioTTSCredentials) + ); case "google": return applyProperties( new GoogleTTSClient(credentials as import("./engines/google").GoogleTTSCredentials) @@ -117,14 +147,40 @@ export function createTTSClient(engine: SupportedTTS, credentials?: TTSCredentia return applyProperties( new WitAITTSClient(credentials as import("./engines/witai").WitAITTSCredentials) ); + case "xai": + return applyProperties( + new XaiTTSClient(credentials as import("./engines/xai").XaiTTSCredentials) + ); case "upliftai": return applyProperties( new UpliftAITTSClient(credentials as import("./engines/upliftai").UpliftAITTSCredentials) ); + case "hume": + return applyProperties( + new HumeTTSClient(credentials as import("./engines/hume").HumeTTSCredentials) + ); + case "mistral": + return applyProperties( + new MistralTTSClient(credentials as import("./engines/mistral").MistralTTSCredentials) + ); + case "murf": + return applyProperties( + new MurfTTSClient(credentials as import("./engines/murf").MurfTTSCredentials) + ); 
case "modelslab": return applyProperties( new ModelsLabTTSClient(credentials as import("./engines/modelslab").ModelsLabTTSCredentials) ); + case "resemble": + return applyProperties( + new ResembleTTSClient(credentials as import("./engines/resemble").ResembleTTSCredentials) + ); + case "unrealspeech": + return applyProperties( + new UnrealSpeechTTSClient( + credentials as import("./engines/unrealspeech").UnrealSpeechTTSCredentials + ) + ); case "sherpaonnx": return applyProperties(new SherpaOnnxTTSClient(credentials as any)); case "sherpaonnx-wasm": diff --git a/src/index.ts b/src/index.ts index 96a64ff..40a056e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,49 +1,49 @@ // Core exports export { AbstractTTSClient } from "./core/abstract-tts"; +export { AudioPlayback } from "./core/playback"; export * as SSMLUtils from "./core/ssml-utils"; export * as VoiceUtils from "./core/voice-utils"; -export { AudioPlayback } from "./core/playback"; - -// Factory export -export * from "./factory"; - -// SSML exports -export { SSMLBuilder } from "./ssml/builder"; - -// Markdown exports -export * as SpeechMarkdown from "./markdown/converter"; -export { configureSpeechMarkdown } from "./markdown/converter"; - -// Utility exports -export { getFetch, isFetchAvailable } from "./utils/fetch-utils"; - // Engine exports export { AzureTTSClient } from "./engines/azure"; +export { CartesiaTTSClient } from "./engines/cartesia"; +export { DeepgramTTSClient } from "./engines/deepgram"; export { ElevenLabsTTSClient } from "./engines/elevenlabs"; +export { EspeakNodeTTSClient, EspeakTTSClient } from "./engines/espeak"; +export { EspeakBrowserTTSClient, EspeakWasmTTSClient } from "./engines/espeak-wasm"; +export { FishAudioTTSClient } from "./engines/fishaudio"; export { GoogleTTSClient } from "./engines/google"; +export { HumeTTSClient } from "./engines/hume"; +export { MistralTTSClient } from "./engines/mistral"; +export { ModelsLabTTSClient } from "./engines/modelslab"; +export { 
MurfTTSClient } from "./engines/murf"; export { OpenAITTSClient } from "./engines/openai"; export { PlayHTTTSClient } from "./engines/playht"; export { PollyTTSClient } from "./engines/polly"; +export { ResembleTTSClient } from "./engines/resemble"; +export { SAPITTSClient } from "./engines/sapi"; export { SherpaOnnxTTSClient } from "./engines/sherpaonnx"; -// Note: The browser-only SherpaONNX WASM engine is not exported from the Node entry. -// Import it from 'js-tts-wrapper/browser' instead. -// export { SherpaOnnxWasmTTSClient } from "./engines/sherpaonnx-wasm"; -export { EspeakNodeTTSClient, EspeakTTSClient } from "./engines/espeak"; -export { EspeakBrowserTTSClient, EspeakWasmTTSClient } from "./engines/espeak-wasm"; +export { UnrealSpeechTTSClient } from "./engines/unrealspeech"; +export { UpliftAITTSClient } from "./engines/upliftai"; export { WatsonTTSClient } from "./engines/watson"; export { WitAITTSClient } from "./engines/witai"; -export { UpliftAITTSClient } from "./engines/upliftai"; -export { ModelsLabTTSClient } from "./engines/modelslab"; -export { SAPITTSClient } from "./engines/sapi"; - +export { XaiTTSClient } from "./engines/xai"; +// Factory export +export * from "./factory"; +// Markdown exports +export * as SpeechMarkdown from "./markdown/converter"; +export { configureSpeechMarkdown } from "./markdown/converter"; +// SSML exports +export { SSMLBuilder } from "./ssml/builder"; // Type exports export type { CredentialsCheckResult, + PropertyType, + SimpleCallback, SpeakOptions, - UnifiedVoice, TTSCredentials, TTSEventType, + UnifiedVoice, WordBoundaryCallback, - SimpleCallback, - PropertyType, } from "./types"; +// Utility exports +export { getFetch, isFetchAvailable } from "./utils/fetch-utils"; diff --git a/src/types.ts b/src/types.ts index 4132912..78d5eb3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -113,7 +113,16 @@ export type UnifiedVoice = { | "sherpaonnx" | "sherpaonnx-wasm" | "espeak-ng" - | "sapi"; + | "sapi" + | "cartesia" + | 
"deepgram" + | "hume" + | "mistral" + | "murf" + | "resemble" + | "unrealspeech" + | "xai" + | "fishaudio"; /** * Language codes supported by this voice From f6aed9b9d5f49b91b45cfdd974842bf2396db718 Mon Sep 17 00:00:00 2001 From: will wade Date: Wed, 8 Apr 2026 12:51:11 +0100 Subject: [PATCH 2/7] docs: add 9 new engines to README and update BACKLOG - Add engine table entries for Cartesia, Deepgram, Hume, xAI, Fish Audio, Mistral, Murf, Unreal Speech, Resemble - Add engine-specific examples with usage notes - Update timing table, SSML table, and Speech Markdown table - Update factory pattern engine list - Move Mistral/Murf/Unreal/Resemble to Completed in BACKLOG --- README.md | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e3dbac7..95e31f5 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,15 @@ A JavaScript/TypeScript library that provides a unified API for working with mul | `espeak-wasm` | `EspeakBrowserTTSClient` | Both | eSpeak NG | `mespeak` (Node.js) or meSpeak.js (browser) | | `sapi` | `SAPITTSClient` | Node.js | Windows Speech API (SAPI) | None (uses PowerShell) | | `witai` | `WitAITTSClient` | Both | Wit.ai | None (uses fetch API) | +| `cartesia` | `CartesiaTTSClient` | Both | Cartesia | None (uses fetch API) | +| `deepgram` | `DeepgramTTSClient` | Both | Deepgram | None (uses fetch API) | +| `hume` | `HumeTTSClient` | Both | Hume AI | None (uses fetch API) | +| `xai` | `XaiTTSClient` | Both | xAI (Grok) | None (uses fetch API) | +| `fishaudio` | `FishAudioTTSClient` | Both | Fish Audio | None (uses fetch API) | +| `mistral` | `MistralTTSClient` | Both | Mistral AI | None (uses fetch API) | +| `murf` | `MurfTTSClient` | Both | Murf AI | None (uses fetch API) | +| `unrealspeech` | `UnrealSpeechTTSClient` | Both | Unreal Speech | None (uses fetch API) | +| `resemble` | `ResembleTTSClient` | Both | Resemble AI | None (uses fetch API) | **Factory Name**: Use
with `createTTSClient('factory-name', credentials)` **Class Name**: Use with direct import `import { ClassName } from 'js-tts-wrapper'` @@ -90,6 +99,15 @@ A JavaScript/TypeScript library that provides a unified API for working with mul | **SherpaOnnx** | ✅ | Estimated | ❌ | Low | | **SherpaOnnx-WASM** | ✅ | Estimated | ❌ | Low | | **SAPI** | ✅ | Estimated | ❌ | Low | +| **Cartesia** | ✅ | Estimated | ❌ | Low | +| **Deepgram** | ✅ | Estimated | ❌ | Low | +| **Hume** | ✅ | Estimated | ❌ | Low | +| **xAI** | ✅ | Estimated | ❌ | Low | +| **Fish Audio** | ✅ | Estimated | ❌ | Low | +| **Mistral** | ✅ | Estimated | ❌ | Low | +| **Murf** | ✅ | Estimated | ❌ | Low | +| **Unreal Speech** | ✅ | Estimated | ❌ | Low | +| **Resemble** | ✅ | Estimated | ❌ | Low | **Character-Level Timing**: Only ElevenLabs provides precise character-level timing data via the `/with-timestamps` endpoint, enabling the most accurate word highlighting and speech synchronization. @@ -253,7 +271,7 @@ async function runExample() { runExample().catch(console.error); ``` -The factory supports all engines: `'azure'`, `'google'`, `'polly'`, `'elevenlabs'`, `'openai'`, `'modelslab'`, `'playht'`, `'watson'`, `'witai'`, `'sherpaonnx'`, `'sherpaonnx-wasm'`, `'espeak'`, `'espeak-wasm'`, `'sapi'`, etc. +The factory supports all engines: `'azure'`, `'google'`, `'polly'`, `'elevenlabs'`, `'openai'`, `'modelslab'`, `'playht'`, `'watson'`, `'witai'`, `'sherpaonnx'`, `'sherpaonnx-wasm'`, `'espeak'`, `'espeak-wasm'`, `'sapi'`, `'cartesia'`, `'deepgram'`, `'hume'`, `'xai'`, `'fishaudio'`, `'mistral'`, `'murf'`, `'unrealspeech'`, `'resemble'`, etc. 
## Core Functionality @@ -471,6 +489,15 @@ The following engines **automatically strip SSML tags** and convert to plain tex - **PlayHT** - SSML tags are removed, plain text is synthesized - **ModelsLab** - SSML tags are removed, plain text is synthesized - **SherpaOnnx/SherpaOnnx-WASM** - SSML tags are removed, plain text is synthesized +- **Cartesia** - SSML tags removed; audio tags (`[laugh]`, `[sigh]`, etc.) mapped to emotion SSML tags for sonic-3, stripped for others +- **Deepgram** - SSML tags are removed, plain text is synthesized +- **Hume** - SSML tags are removed, plain text is synthesized +- **xAI** - SSML tags are removed; audio tags passed natively for grok-tts +- **Fish Audio** - SSML tags removed; audio tags passed natively for s2-pro +- **Mistral** - SSML tags are removed, plain text is synthesized +- **Murf** - SSML tags are removed, plain text is synthesized +- **Unreal Speech** - SSML tags are removed, plain text is synthesized +- **Resemble** - SSML tags are removed, plain text is synthesized ### Usage Examples @@ -667,6 +694,15 @@ When disabled, js-tts-wrapper falls back to the lightweight built-in converter ( | OpenAI | ✅ Converted | → SSML → Plain text | | PlayHT | ✅ Converted | → SSML → Plain text | | SherpaOnnx | ✅ Converted | → SSML → Plain text | +| Cartesia | ✅ Converted | → SSML → Plain text | +| Deepgram | ✅ Converted | → SSML → Plain text | +| Hume | ✅ Converted | → SSML → Plain text | +| xAI | ✅ Converted | → SSML → Plain text | +| Fish Audio | ✅ Converted | → SSML → Plain text | +| Mistral | ✅ Converted | → SSML → Plain text | +| Murf | ✅ Converted | → SSML → Plain text | +| Unreal Speech | ✅ Converted | → SSML → Plain text | +| Resemble | ✅ Converted | → SSML → Plain text | ### Speech Markdown vs Raw SSML: When to Use Each @@ -1069,6 +1105,112 @@ await tts.speak('Hello from Windows SAPI!'); > **Note**: This engine is **Windows-only** +### Cartesia + +```javascript +import { CartesiaTTSClient } from 'js-tts-wrapper'; + +const tts = new
CartesiaTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('sonic-3'); // or 'sonic-2' +await tts.speak('Hello from Cartesia!'); +``` + +> Audio tags like `[laugh]`, `[sigh]` are mapped to emotion SSML tags for sonic-3, stripped for other models. + +### Deepgram + +```javascript +import { DeepgramTTSClient } from 'js-tts-wrapper'; + +const tts = new DeepgramTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('aura-2-asteria-en'); +await tts.speak('Hello from Deepgram!'); +``` + +> Uses a static voice list. Model and voice are combined in the URL parameter. + +### Hume AI + +```javascript +import { HumeTTSClient } from 'js-tts-wrapper'; + +const tts = new HumeTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('ito'); // or any Hume voice name +await tts.speak('Hello from Hume!'); +``` + +> Supports `octave-2` and `octave-1` models. Streaming uses a separate `/tts/stream/file` endpoint. + +### xAI (Grok) + +```javascript +import { XaiTTSClient } from 'js-tts-wrapper'; + +const tts = new XaiTTSClient({ apiKey: 'your-api-key' }); +await tts.speak('Hello from xAI!'); +``` + +> Native audio tag passthrough for grok-tts model. Language can be configured via properties. + +### Fish Audio + +```javascript +import { FishAudioTTSClient } from 'js-tts-wrapper'; + +const tts = new FishAudioTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('your-voice-reference-id'); +await tts.speak('Hello from Fish Audio!'); +``` + +> Model ID is passed as a header. Audio tags passed natively for s2-pro model. + +### Mistral + +```javascript +import { MistralTTSClient } from 'js-tts-wrapper'; + +const tts = new MistralTTSClient({ apiKey: 'your-api-key' }); +await tts.speak('Hello from Mistral!'); +``` + +> Uses SSE streaming with base64 audio chunks. Non-streaming returns base64 JSON.
+ +### Murf + +```javascript +import { MurfTTSClient } from 'js-tts-wrapper'; + +const tts = new MurfTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('en-US-natalie'); +await tts.speak('Hello from Murf!'); +``` + +> Two models: GEN2 (base64 response) and FALCON (binary streaming). Uses static voice list. + +### Unreal Speech + +```javascript +import { UnrealSpeechTTSClient } from 'js-tts-wrapper'; + +const tts = new UnrealSpeechTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('Scarlett'); +await tts.speak('Hello from Unreal Speech!'); +``` + +> Non-streaming uses two-step URI-based flow. Streaming returns audio directly. + +### Resemble + +```javascript +import { ResembleTTSClient } from 'js-tts-wrapper'; + +const tts = new ResembleTTSClient({ apiKey: 'your-api-key' }); +await tts.setVoice('your-voice-id'); +await tts.speak('Hello from Resemble!'); +``` + +> Non-streaming returns base64 JSON. Streaming returns raw binary audio. + ## API Reference ### Factory Function From f53cc07230d09819a88a3e2a080bf49d4054e4b2 Mon Sep 17 00:00:00 2001 From: will wade Date: Wed, 8 Apr 2026 14:13:21 +0100 Subject: [PATCH 3/7] fix: audit fixes for all new engines + pre-existing build/lint issues - Replace Buffer.from() in ElevenLabs with cross-env base64ToUint8Array - Replace atob() in Mistral/Murf/Resemble with shared utility - Add shared src/utils/base64-utils.ts for cross-env base64 decoding - Fix Fish Audio checkCredentials to use GET /v1/model (no quota consumed) - Fix Resemble checkCredentials to use GET /v2/voices (no quota consumed) - Add voice discovery: Hume (16 static voices), Mistral (27 static voices) - Add Resemble voice listing via API - Fix xAI: remove dead processAudioTags no-op code - Fix pre-existing lint issues in google.ts, polly.ts, playht.ts, openai.ts, sherpaonnx.ts, abstract-tts.ts, azure.ts --- src/__tests__/audio-input.test.ts | 201 ++++++++++------ src/__tests__/azure-mstts-namespace.test.ts | 7 +- 
src/__tests__/cartesia-compliance.test.ts | 2 +- src/__tests__/cartesia.test.ts | 2 +- src/__tests__/credential-validation.test.ts | 50 ++-- src/__tests__/deepgram-compliance.test.ts | 6 +- src/__tests__/deepgram.test.ts | 2 +- src/__tests__/elevenlabs-audio-tags.test.ts | 6 +- src/__tests__/fishaudio.test.ts | 2 +- src/__tests__/hume.test.ts | 2 +- src/__tests__/mistral.test.ts | 45 +++- src/__tests__/mock-tts-client.helper.ts | 20 +- src/__tests__/murf.test.ts | 60 +++-- src/__tests__/openai.test.ts | 94 ++++---- src/__tests__/polly-ssml.test.ts | 95 ++++---- src/__tests__/resemble.test.ts | 44 +++- src/__tests__/sapi-ssml-handling.test.ts | 98 ++++---- src/__tests__/sapi-voice-selection.test.ts | 44 ++-- src/__tests__/sherpa-optional-import.test.ts | 11 +- .../speech-markdown-converter.test.ts | 15 +- src/__tests__/ssml-compatibility.test.ts | 126 +++++----- src/__tests__/ssml-comprehensive.test.ts | 102 ++++---- src/__tests__/tts-engine.test.ts | 116 ++++++---- src/__tests__/unrealspeech.test.ts | 49 +++- src/__tests__/upliftai.test.ts | 30 +-- src/__tests__/watson.test.ts | 117 +++++----- src/__tests__/witai.test.ts | 194 ++++++++-------- src/__tests__/xai.test.ts | 2 +- src/core/abstract-tts.ts | 219 +++++++++++------- src/engines/azure.ts | 92 +++++--- src/engines/elevenlabs.ts | 7 +- src/engines/espeak.ts | 10 +- src/engines/fishaudio.ts | 7 +- src/engines/google.ts | 142 +++++++----- src/engines/hume.ts | 41 +++- src/engines/mistral.ts | 64 +++-- src/engines/modelslab.ts | 87 +++++-- src/engines/murf.ts | 7 +- src/engines/openai.ts | 23 +- src/engines/playht.ts | 5 +- src/engines/polly.ts | 126 +++++----- src/engines/resemble.ts | 45 ++-- src/engines/sapi.ts | 53 +++-- src/engines/sherpaonnx-wasm.ts | 4 +- src/engines/sherpaonnx.ts | 5 +- src/engines/upliftai.ts | 2 +- src/engines/watson.ts | 2 +- src/engines/witai.ts | 6 +- src/engines/xai.ts | 3 - src/markdown/converter-browser.ts | 2 +- src/utils/base64-utils.ts | 8 + src/utils/sherpaonnx-loader.js | 14 
+- src/utils/stream-utils.ts | 3 +- 53 files changed, 1522 insertions(+), 997 deletions(-) create mode 100644 src/utils/base64-utils.ts diff --git a/src/__tests__/audio-input.test.ts b/src/__tests__/audio-input.test.ts index ffc370d..41eb6a3 100644 --- a/src/__tests__/audio-input.test.ts +++ b/src/__tests__/audio-input.test.ts @@ -2,157 +2,220 @@ * Tests for the new audio input functionality */ -import { validateSpeakInput, getAudioFormatFromFilename, detectAudioFormat, processAudioInput } from '../utils/audio-input'; -import type { SpeakInput } from '../types'; - -describe('Audio Input Utilities', () => { - describe('validateSpeakInput', () => { - it('should accept text input', () => { - const input: SpeakInput = { text: 'Hello world' }; +import type { SpeakInput } from "../types"; +import { + detectAudioFormat, + getAudioFormatFromFilename, + processAudioInput, + validateSpeakInput, +} from "../utils/audio-input"; + +describe("Audio Input Utilities", () => { + describe("validateSpeakInput", () => { + it("should accept text input", () => { + const input: SpeakInput = { text: "Hello world" }; expect(() => validateSpeakInput(input)).not.toThrow(); }); - it('should accept filename input', () => { - const input: SpeakInput = { filename: 'test.mp3' }; + it("should accept filename input", () => { + const input: SpeakInput = { filename: "test.mp3" }; expect(() => validateSpeakInput(input)).not.toThrow(); }); - it('should accept audioBytes input', () => { + it("should accept audioBytes input", () => { const input: SpeakInput = { audioBytes: new Uint8Array([1, 2, 3]) }; expect(() => validateSpeakInput(input)).not.toThrow(); }); - it('should accept audioStream input', () => { + it("should accept audioStream input", () => { const stream = new ReadableStream({ start(controller) { controller.enqueue(new Uint8Array([1, 2, 3])); controller.close(); - } + }, }); const input: SpeakInput = { audioStream: stream }; expect(() => validateSpeakInput(input)).not.toThrow(); }); - 
it('should throw error when no input provided', () => { + it("should throw error when no input provided", () => { const input: SpeakInput = {}; - expect(() => validateSpeakInput(input)).toThrow('No input provided'); + expect(() => validateSpeakInput(input)).toThrow("No input provided"); }); - it('should throw error when multiple inputs provided', () => { - const input: SpeakInput = { - text: 'Hello', - filename: 'test.mp3' + it("should throw error when multiple inputs provided", () => { + const input: SpeakInput = { + text: "Hello", + filename: "test.mp3", }; - expect(() => validateSpeakInput(input)).toThrow('Multiple input sources provided'); + expect(() => validateSpeakInput(input)).toThrow("Multiple input sources provided"); }); }); - describe('getAudioFormatFromFilename', () => { - it('should detect MP3 format', () => { - expect(getAudioFormatFromFilename('test.mp3')).toBe('audio/mpeg'); - expect(getAudioFormatFromFilename('TEST.MP3')).toBe('audio/mpeg'); + describe("getAudioFormatFromFilename", () => { + it("should detect MP3 format", () => { + expect(getAudioFormatFromFilename("test.mp3")).toBe("audio/mpeg"); + expect(getAudioFormatFromFilename("TEST.MP3")).toBe("audio/mpeg"); }); - it('should detect WAV format', () => { - expect(getAudioFormatFromFilename('test.wav')).toBe('audio/wav'); + it("should detect WAV format", () => { + expect(getAudioFormatFromFilename("test.wav")).toBe("audio/wav"); }); - it('should detect OGG format', () => { - expect(getAudioFormatFromFilename('test.ogg')).toBe('audio/ogg'); + it("should detect OGG format", () => { + expect(getAudioFormatFromFilename("test.ogg")).toBe("audio/ogg"); }); - it('should default to WAV for unknown extensions', () => { - expect(getAudioFormatFromFilename('test.unknown')).toBe('audio/wav'); - expect(getAudioFormatFromFilename('test')).toBe('audio/wav'); + it("should default to WAV for unknown extensions", () => { + expect(getAudioFormatFromFilename("test.unknown")).toBe("audio/wav"); + 
expect(getAudioFormatFromFilename("test")).toBe("audio/wav"); }); }); - describe('detectAudioFormat', () => { - it('should detect WAV format from header', () => { + describe("detectAudioFormat", () => { + it("should detect WAV format from header", () => { // WAV header: RIFF....WAVE const wavHeader = new Uint8Array([ - 0x52, 0x49, 0x46, 0x46, // RIFF - 0x00, 0x00, 0x00, 0x00, // file size - 0x57, 0x41, 0x56, 0x45 // WAVE + 0x52, + 0x49, + 0x46, + 0x46, // RIFF + 0x00, + 0x00, + 0x00, + 0x00, // file size + 0x57, + 0x41, + 0x56, + 0x45, // WAVE ]); - expect(detectAudioFormat(wavHeader)).toBe('audio/wav'); + expect(detectAudioFormat(wavHeader)).toBe("audio/wav"); }); - it('should detect MP3 format from ID3 header', () => { + it("should detect MP3 format from ID3 header", () => { // ID3 header const mp3Header = new Uint8Array([ - 0x49, 0x44, 0x33, // ID3 - 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0x49, + 0x44, + 0x33, // ID3 + 0x03, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, ]); - expect(detectAudioFormat(mp3Header)).toBe('audio/mpeg'); + expect(detectAudioFormat(mp3Header)).toBe("audio/mpeg"); }); - it('should detect MP3 format from MPEG frame sync', () => { + it("should detect MP3 format from MPEG frame sync", () => { // MPEG frame sync const mp3Header = new Uint8Array([ - 0xFF, 0xFB, // MPEG frame sync - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0xff, + 0xfb, // MPEG frame sync + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, ]); - expect(detectAudioFormat(mp3Header)).toBe('audio/mpeg'); + expect(detectAudioFormat(mp3Header)).toBe("audio/mpeg"); }); - it('should detect OGG format from header', () => { + it("should detect OGG format from header", () => { // OGG header: OggS const oggHeader = new Uint8Array([ - 0x4F, 0x67, 0x67, 0x53, // OggS - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0x4f, + 0x67, + 0x67, + 0x53, // OggS + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 
0x00, + 0x00, ]); - expect(detectAudioFormat(oggHeader)).toBe('audio/ogg'); + expect(detectAudioFormat(oggHeader)).toBe("audio/ogg"); }); - it('should default to WAV for unknown formats', () => { + it("should default to WAV for unknown formats", () => { const unknownHeader = new Uint8Array([0x00, 0x01, 0x02, 0x03]); - expect(detectAudioFormat(unknownHeader)).toBe('audio/wav'); + expect(detectAudioFormat(unknownHeader)).toBe("audio/wav"); }); - it('should default to WAV for short arrays', () => { + it("should default to WAV for short arrays", () => { const shortArray = new Uint8Array([0x00, 0x01]); - expect(detectAudioFormat(shortArray)).toBe('audio/wav'); + expect(detectAudioFormat(shortArray)).toBe("audio/wav"); }); }); - describe('processAudioInput', () => { - it('should process audioBytes input', async () => { + describe("processAudioInput", () => { + it("should process audioBytes input", async () => { const audioBytes = new Uint8Array([ - 0x52, 0x49, 0x46, 0x46, // RIFF - 0x00, 0x00, 0x00, 0x00, // file size - 0x57, 0x41, 0x56, 0x45 // WAVE + 0x52, + 0x49, + 0x46, + 0x46, // RIFF + 0x00, + 0x00, + 0x00, + 0x00, // file size + 0x57, + 0x41, + 0x56, + 0x45, // WAVE ]); const input: SpeakInput = { audioBytes }; - + const result = await processAudioInput(input); expect(result.audioBytes).toBe(audioBytes); - expect(result.mimeType).toBe('audio/wav'); + expect(result.mimeType).toBe("audio/wav"); }); - it('should process audioStream input', async () => { + it("should process audioStream input", async () => { const testData = new Uint8Array([ - 0x49, 0x44, 0x33, // ID3 - 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0x49, + 0x44, + 0x33, // ID3 + 0x03, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, + 0x00, ]); - + const stream = new ReadableStream({ start(controller) { controller.enqueue(testData); controller.close(); - } + }, }); - + const input: SpeakInput = { audioStream: stream }; - + const result = await processAudioInput(input); 
expect(result.audioBytes).toEqual(testData); - expect(result.mimeType).toBe('audio/mpeg'); + expect(result.mimeType).toBe("audio/mpeg"); }); - it('should throw error for invalid input', async () => { + it("should throw error for invalid input", async () => { const input: SpeakInput = {}; - await expect(processAudioInput(input)).rejects.toThrow('No input provided'); + await expect(processAudioInput(input)).rejects.toThrow("No input provided"); }); }); }); diff --git a/src/__tests__/azure-mstts-namespace.test.ts b/src/__tests__/azure-mstts-namespace.test.ts index 1282b63..e49a19d 100644 --- a/src/__tests__/azure-mstts-namespace.test.ts +++ b/src/__tests__/azure-mstts-namespace.test.ts @@ -23,7 +23,7 @@ describe("Azure MSTTS Namespace Handling", () => { expect(result).toContain('xmlns:mstts="https://www.w3.org/2001/mstts"'); expect(result).toContain('xmlns="http://www.w3.org/2001/10/synthesis"'); expect(result).toContain('version="1.0"'); - expect(result).toContain('xml:lang='); + expect(result).toContain("xml:lang="); }); it("should not add mstts namespace when SSML does not contain mstts tags", async () => { @@ -33,7 +33,7 @@ describe("Azure MSTTS Namespace Handling", () => { const result = (client as any).ensureAzureSSMLStructure(ssml, "en-US-AriaNeural"); - expect(result).not.toContain('xmlns:mstts='); + expect(result).not.toContain("xmlns:mstts="); expect(result).toContain('xmlns="http://www.w3.org/2001/10/synthesis"'); }); @@ -110,7 +110,7 @@ describe("Azure MSTTS Namespace Handling", () => { expect(result).toContain('version="1.0"'); expect(result).toContain('xmlns="http://www.w3.org/2001/10/synthesis"'); - expect(result).toContain('xml:lang='); + expect(result).toContain("xml:lang="); }); it("should not duplicate attributes", async () => { @@ -130,4 +130,3 @@ describe("Azure MSTTS Namespace Handling", () => { }); }); }); - diff --git a/src/__tests__/cartesia-compliance.test.ts b/src/__tests__/cartesia-compliance.test.ts index 073daf2..16e1b57 100644 --- 
a/src/__tests__/cartesia-compliance.test.ts +++ b/src/__tests__/cartesia-compliance.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { CartesiaTTSClient } from "../engines/cartesia"; import { createTTSClient } from "../factory"; diff --git a/src/__tests__/cartesia.test.ts b/src/__tests__/cartesia.test.ts index c490c3b..669d7f0 100644 --- a/src/__tests__/cartesia.test.ts +++ b/src/__tests__/cartesia.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it } from "@jest/globals"; import { CartesiaTTSClient } from "../engines/cartesia"; describe("CartesiaTTSClient", () => { diff --git a/src/__tests__/credential-validation.test.ts b/src/__tests__/credential-validation.test.ts index d682b16..73a6487 100644 --- a/src/__tests__/credential-validation.test.ts +++ b/src/__tests__/credential-validation.test.ts @@ -1,8 +1,8 @@ -import { describe, it, expect } from "@jest/globals"; -import { createTTSClient } from "../factory"; +import { describe, expect, it } from "@jest/globals"; +import { AzureTTSClient } from "../engines/azure"; import { EspeakBrowserTTSClient } from "../engines/espeak-wasm"; import { OpenAITTSClient } from "../engines/openai"; -import { AzureTTSClient } from "../engines/azure"; +import { createTTSClient } from "../factory"; /** * Test suite for credential validation functionality @@ -15,24 +15,24 @@ describe("Credential Validation", () => { expect(status).toMatchObject({ valid: true, - engine: expect.stringContaining('espeak'), + engine: expect.stringContaining("espeak"), environment: expect.stringMatching(/^(browser|node)$/), requiresCredentials: false, credentialTypes: [], - message: expect.stringContaining('credentials are valid') + message: expect.stringContaining("credentials are valid"), }); }); it("should return proper status structure for 
engines requiring credentials", async () => { - const tts = new AzureTTSClient({ subscriptionKey: '', region: '' }); + const tts = new AzureTTSClient({ subscriptionKey: "", region: "" }); const status = await tts.getCredentialStatus(); expect(status).toMatchObject({ - engine: expect.stringContaining('azure'), + engine: expect.stringContaining("azure"), environment: expect.stringMatching(/^(browser|node)$/), requiresCredentials: true, - credentialTypes: ['subscriptionKey', 'region'], - message: expect.any(String) + credentialTypes: ["subscriptionKey", "region"], + message: expect.any(String), }); // Should be invalid with empty credentials @@ -51,11 +51,11 @@ describe("Credential Validation", () => { it("should return correct credential types for cloud engines", async () => { const openaiTTS = new OpenAITTSClient({}); const openaiCreds = (openaiTTS as any).getRequiredCredentials(); - expect(openaiCreds).toEqual(['apiKey']); + expect(openaiCreds).toEqual(["apiKey"]); - const azureTTS = new AzureTTSClient({ subscriptionKey: '', region: '' }); + const azureTTS = new AzureTTSClient({ subscriptionKey: "", region: "" }); const azureCreds = (azureTTS as any).getRequiredCredentials(); - expect(azureCreds).toEqual(['subscriptionKey', 'region']); + expect(azureCreds).toEqual(["subscriptionKey", "region"]); }); }); @@ -67,13 +67,13 @@ describe("Credential Validation", () => { }); it("should return false for engines with invalid credentials", async () => { - const tts = new OpenAITTSClient({ apiKey: 'fake-key' }); + const tts = new OpenAITTSClient({ apiKey: "fake-key" }); const isValid = await tts.checkCredentials(); expect(isValid).toBe(false); }); it("should return false for engines with missing credentials", async () => { - const tts = new AzureTTSClient({ subscriptionKey: '', region: '' }); + const tts = new AzureTTSClient({ subscriptionKey: "", region: "" }); const isValid = await tts.checkCredentials(); expect(isValid).toBe(false); }); @@ -81,24 +81,24 @@ 
describe("Credential Validation", () => { describe("Factory integration", () => { it("should work with factory-created clients", async () => { - const tts = createTTSClient('espeak-wasm', {}); + const tts = createTTSClient("espeak-wasm", {}); const status = await tts.getCredentialStatus(); - + expect(status.valid).toBe(true); expect(status.requiresCredentials).toBe(false); }); it("should handle invalid credentials gracefully", async () => { - const tts = createTTSClient('openai', { apiKey: 'fake-key' }); - + const tts = createTTSClient("openai", { apiKey: "fake-key" }); + // Should not throw, but return false const isValid = await tts.checkCredentials(); expect(isValid).toBe(false); - + // Status should provide detailed information const status = await tts.getCredentialStatus(); expect(status.valid).toBe(false); - expect(status.credentialTypes).toContain('apiKey'); + expect(status.credentialTypes).toContain("apiKey"); }); }); @@ -106,22 +106,22 @@ describe("Credential Validation", () => { it("should correctly detect environment", async () => { const tts = new EspeakBrowserTTSClient({}); const status = await tts.getCredentialStatus(); - + // Should detect Node.js environment in test - expect(status.environment).toBe('node'); + expect(status.environment).toBe("node"); }); }); describe("Error handling", () => { it("should handle errors gracefully in getCredentialStatus", async () => { // Create a client that will have invalid credentials - const tts = new AzureTTSClient({ subscriptionKey: '', region: '' }); + const tts = new AzureTTSClient({ subscriptionKey: "", region: "" }); const status = await tts.getCredentialStatus(); expect(status.valid).toBe(false); - expect(status.message).toContain('credentials are invalid'); - expect(status.credentialTypes).toEqual(['subscriptionKey', 'region']); + expect(status.message).toContain("credentials are invalid"); + expect(status.credentialTypes).toEqual(["subscriptionKey", "region"]); }); }); }); diff --git 
a/src/__tests__/deepgram-compliance.test.ts b/src/__tests__/deepgram-compliance.test.ts index cf0e069..466729c 100644 --- a/src/__tests__/deepgram-compliance.test.ts +++ b/src/__tests__/deepgram-compliance.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { DeepgramTTSClient } from "../engines/deepgram"; import { createTTSClient } from "../factory"; @@ -102,9 +102,7 @@ describe("DeepgramTTSClient — Unified API compliance", () => { const voices = await client.getVoicesByLanguage("en-US"); expect(voices.length).toBeGreaterThan(0); for (const v of voices) { - expect( - v.languageCodes.some((l) => l.bcp47 === "en-US" || l.iso639_3 === "eng") - ).toBe(true); + expect(v.languageCodes.some((l) => l.bcp47 === "en-US" || l.iso639_3 === "eng")).toBe(true); } }); }); diff --git a/src/__tests__/deepgram.test.ts b/src/__tests__/deepgram.test.ts index 1788530..15c4ec8 100644 --- a/src/__tests__/deepgram.test.ts +++ b/src/__tests__/deepgram.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it } from "@jest/globals"; import { DeepgramTTSClient } from "../engines/deepgram"; describe("DeepgramTTSClient", () => { diff --git a/src/__tests__/elevenlabs-audio-tags.test.ts b/src/__tests__/elevenlabs-audio-tags.test.ts index b3dd705..3be4013 100644 --- a/src/__tests__/elevenlabs-audio-tags.test.ts +++ b/src/__tests__/elevenlabs-audio-tags.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it } from "@jest/globals"; import { ElevenLabsTTSClient } from "../engines/elevenlabs"; describe("ElevenLabs Audio Tag Support", () => { @@ -29,7 +29,9 @@ describe("ElevenLabs Audio Tag Support", () => { it("should pass per-request model override for v3 audio tags", () => { client.setProperty("model", 
"eleven_multilingual_v2"); - const result = (client as any).processAudioTags("Hello [laugh] world", { model: "eleven_v3" }); + const result = (client as any).processAudioTags("Hello [laugh] world", { + model: "eleven_v3", + }); expect(result).toBe("Hello [laugh] world"); }); }); diff --git a/src/__tests__/fishaudio.test.ts b/src/__tests__/fishaudio.test.ts index f005cc9..f8577d8 100644 --- a/src/__tests__/fishaudio.test.ts +++ b/src/__tests__/fishaudio.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { FishAudioTTSClient } from "../engines/fishaudio"; import { createTTSClient } from "../factory"; diff --git a/src/__tests__/hume.test.ts b/src/__tests__/hume.test.ts index 2e4b5ca..3dc55b4 100644 --- a/src/__tests__/hume.test.ts +++ b/src/__tests__/hume.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { HumeTTSClient } from "../engines/hume"; import { createTTSClient } from "../factory"; diff --git a/src/__tests__/mistral.test.ts b/src/__tests__/mistral.test.ts index cfb9686..0e02ffc 100644 --- a/src/__tests__/mistral.test.ts +++ b/src/__tests__/mistral.test.ts @@ -1,20 +1,45 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { MistralTTSClient } from "../engines/mistral"; import { createTTSClient } from "../factory"; describe("MistralTTSClient", () => { let client: MistralTTSClient; - beforeEach(() => { client = new MistralTTSClient({ apiKey: "test" }); }); + beforeEach(() => { + client = new MistralTTSClient({ apiKey: "test" }); + }); it("initializes with defaults", () => { expect(client.getProperty("model")).toBe("voxtral-mini-tts-2603"); }); - it("sets model", () => { client.setProperty("model", "other"); 
expect(client.getProperty("model")).toBe("other"); }); - it("sets voice via voiceId", () => { client.setVoice("v1"); expect(client.getProperty("voice")).toBe("v1"); }); - it("checks credentials without key", async () => { expect(await new MistralTTSClient({}).checkCredentials()).toBe(false); }); - it("creates via factory", () => { expect(createTTSClient("mistral", { apiKey: "t" })).toBeInstanceOf(MistralTTSClient); }); - it("strips SSML", async () => { expect(await (client as any).prepareText("Hi")).toBe("Hi"); }); - it("creates word timings", () => { (client as any)._createEstimatedWordTimings("a b c"); expect((client as any).timings.length).toBe(3); }); - it("supports events", () => { const fn = jest.fn(); client.on("end", fn); (client as any).emit("end"); expect(fn).toHaveBeenCalled(); }); - it("credential status", async () => { const s = await client.getCredentialStatus(); expect(s.engine).toBe("mistral"); }); + it("sets model", () => { + client.setProperty("model", "other"); + expect(client.getProperty("model")).toBe("other"); + }); + it("sets voice via voiceId", () => { + client.setVoice("v1"); + expect(client.getProperty("voice")).toBe("v1"); + }); + it("checks credentials without key", async () => { + expect(await new MistralTTSClient({}).checkCredentials()).toBe(false); + }); + it("creates via factory", () => { + expect(createTTSClient("mistral", { apiKey: "t" })).toBeInstanceOf(MistralTTSClient); + }); + it("strips SSML", async () => { + expect(await (client as any).prepareText("Hi")).toBe("Hi"); + }); + it("creates word timings", () => { + (client as any)._createEstimatedWordTimings("a b c"); + expect((client as any).timings.length).toBe(3); + }); + it("supports events", () => { + const fn = jest.fn(); + client.on("end", fn); + (client as any).emit("end"); + expect(fn).toHaveBeenCalled(); + }); + it("credential status", async () => { + const s = await client.getCredentialStatus(); + expect(s.engine).toBe("mistral"); + }); }); diff --git 
a/src/__tests__/mock-tts-client.helper.ts b/src/__tests__/mock-tts-client.helper.ts index 0241753..6430967 100644 --- a/src/__tests__/mock-tts-client.helper.ts +++ b/src/__tests__/mock-tts-client.helper.ts @@ -1,5 +1,5 @@ import { AbstractTTSClient } from "../core/abstract-tts"; -import type { SpeakOptions, UnifiedVoice, WordBoundaryCallback, WordBoundary } from "../types"; +import type { SpeakOptions, UnifiedVoice, WordBoundary, WordBoundaryCallback } from "../types"; /** * Mock TTS client for testing @@ -36,10 +36,9 @@ export class MockTTSClient extends AbstractTTSClient { async synthToBytes(_text: string, _options?: SpeakOptions): Promise { // Return a small WAV file header (44 bytes) return new Uint8Array([ - 0x52, 0x49, 0x46, 0x46, 0x24, 0x00, 0x00, 0x00, 0x57, 0x41, 0x56, 0x45, - 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, - 0x44, 0xac, 0x00, 0x00, 0x88, 0x58, 0x01, 0x00, 0x02, 0x00, 0x10, 0x00, - 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x52, 0x49, 0x46, 0x46, 0x24, 0x00, 0x00, 0x00, 0x57, 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, + 0x20, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x44, 0xac, 0x00, 0x00, 0x88, 0x58, + 0x01, 0x00, 0x02, 0x00, 0x10, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, ]); } @@ -49,7 +48,10 @@ export class MockTTSClient extends AbstractTTSClient { * @param options Synthesis options. * @returns Promise resolving to an object containing the audio stream and word boundaries. 
*/ - async synthToBytestream(text: string, options?: SpeakOptions): Promise<{ + async synthToBytestream( + text: string, + options?: SpeakOptions + ): Promise<{ audioStream: ReadableStream; wordBoundaries: WordBoundary[]; }> { @@ -67,9 +69,9 @@ export class MockTTSClient extends AbstractTTSClient { if (options?.useWordBoundary) { // Create mock word boundaries const wordBoundaries: WordBoundary[] = [ - { text: 'Mock', offset: 0, duration: 500 }, - { text: 'boundary', offset: 500, duration: 500 }, - { text: 'test.', offset: 1000, duration: 500 } + { text: "Mock", offset: 0, duration: 500 }, + { text: "boundary", offset: 500, duration: 500 }, + { text: "test.", offset: 1000, duration: 500 }, ]; return { diff --git a/src/__tests__/murf.test.ts b/src/__tests__/murf.test.ts index 16bbd29..58e15ef 100644 --- a/src/__tests__/murf.test.ts +++ b/src/__tests__/murf.test.ts @@ -1,20 +1,54 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { MurfTTSClient } from "../engines/murf"; import { createTTSClient } from "../factory"; describe("MurfTTSClient", () => { let client: MurfTTSClient; - beforeEach(() => { client = new MurfTTSClient({ apiKey: "test" }); }); + beforeEach(() => { + client = new MurfTTSClient({ apiKey: "test" }); + }); - it("initializes with defaults", () => { expect(client.getProperty("model")).toBe("GEN2"); }); - it("sets model to FALCON", () => { client.setProperty("model", "FALCON"); expect(client.getProperty("model")).toBe("FALCON"); }); - it("sets voice via voiceId", () => { client.setVoice("en-US-owen"); expect(client.getProperty("voice")).toBe("en-US-owen"); }); - it("checks credentials without key", async () => { expect(await new MurfTTSClient({}).checkCredentials()).toBe(false); }); - it("creates via factory", () => { expect(createTTSClient("murf", { apiKey: "t" })).toBeInstanceOf(MurfTTSClient); }); - it("gets voices", async () => { const v = await 
client.getVoices(); expect(v.length).toBeGreaterThan(0); expect(v[0].provider).toBe("murf"); }); - it("filters by language", async () => { const v = await client.getVoicesByLanguage("en"); expect(v.length).toBeGreaterThan(0); }); - it("strips SSML", async () => { expect(await (client as any).prepareText("Hi")).toBe("Hi"); }); - it("creates word timings", () => { (client as any)._createEstimatedWordTimings("a b"); expect((client as any).timings.length).toBe(2); }); - it("supports events", () => { const fn = jest.fn(); client.on("start", fn); (client as any).emit("start"); expect(fn).toHaveBeenCalled(); }); - it("credential status", async () => { const s = await client.getCredentialStatus(); expect(s.engine).toBe("murf"); }); + it("initializes with defaults", () => { + expect(client.getProperty("model")).toBe("GEN2"); + }); + it("sets model to FALCON", () => { + client.setProperty("model", "FALCON"); + expect(client.getProperty("model")).toBe("FALCON"); + }); + it("sets voice via voiceId", () => { + client.setVoice("en-US-owen"); + expect(client.getProperty("voice")).toBe("en-US-owen"); + }); + it("checks credentials without key", async () => { + expect(await new MurfTTSClient({}).checkCredentials()).toBe(false); + }); + it("creates via factory", () => { + expect(createTTSClient("murf", { apiKey: "t" })).toBeInstanceOf(MurfTTSClient); + }); + it("gets voices", async () => { + const v = await client.getVoices(); + expect(v.length).toBeGreaterThan(0); + expect(v[0].provider).toBe("murf"); + }); + it("filters by language", async () => { + const v = await client.getVoicesByLanguage("en"); + expect(v.length).toBeGreaterThan(0); + }); + it("strips SSML", async () => { + expect(await (client as any).prepareText("Hi")).toBe("Hi"); + }); + it("creates word timings", () => { + (client as any)._createEstimatedWordTimings("a b"); + expect((client as any).timings.length).toBe(2); + }); + it("supports events", () => { + const fn = jest.fn(); + client.on("start", fn); + (client as 
any).emit("start"); + expect(fn).toHaveBeenCalled(); + }); + it("credential status", async () => { + const s = await client.getCredentialStatus(); + expect(s.engine).toBe("murf"); + }); }); diff --git a/src/__tests__/openai.test.ts b/src/__tests__/openai.test.ts index 16bf854..2b245d5 100644 --- a/src/__tests__/openai.test.ts +++ b/src/__tests__/openai.test.ts @@ -1,18 +1,18 @@ -import { describe, it, expect, jest, beforeEach, test, afterEach } from '@jest/globals'; -import * as fs from "node:fs"; +import * as fs from "node:fs"; import * as path from "node:path"; -import mock from 'mock-fs'; -import type { SpeechCreateParams } from 'openai/resources/audio/speech'; -import type { Response } from 'openai/core'; -import { OpenAITTSClient } from "../engines/openai"; +import { afterEach, beforeEach, describe, expect, it, jest, test } from "@jest/globals"; +import mock from "mock-fs"; +import type { Response } from "openai/core"; +import type { SpeechCreateParams } from "openai/resources/audio/speech"; +import { OpenAITTSClient } from "../engines/openai"; const nodeModulesPath = path.resolve(process.cwd(), "node_modules"); const mockListResponse = { data: [ - { id: 'tts-1', object: 'model', created: 1, owned_by: 'openai' }, - { id: 'tts-1-hd', object: 'model', created: 1, owned_by: 'openai' }, - { id: 'gpt-4o-mini-tts', object: 'model', created: 1, owned_by: 'openai' }, + { id: "tts-1", object: "model", created: 1, owned_by: "openai" }, + { id: "tts-1-hd", object: "model", created: 1, owned_by: "openai" }, + { id: "gpt-4o-mini-tts", object: "model", created: 1, owned_by: "openai" }, ], }; @@ -22,33 +22,38 @@ const mockOpenAIInstance = { }, audio: { speech: { - create: jest.fn().mockImplementation((async (params: SpeechCreateParams): Promise => { + create: jest.fn().mockImplementation((async ( + params: SpeechCreateParams + ): Promise => { const mockAudioData = Buffer.from(`mock audio for ${params.input}`); - const mockAudioBuffer = 
mockAudioData.buffer.slice(mockAudioData.byteOffset, mockAudioData.byteOffset + mockAudioData.byteLength); + const mockAudioBuffer = mockAudioData.buffer.slice( + mockAudioData.byteOffset, + mockAudioData.byteOffset + mockAudioData.byteLength + ); const headers = new Headers(); - if ((params.response_format as string) === 'json') { - const boundaries = [ - { word: 'Hello', start: 0.1, end: 0.5 }, - { word: 'world', start: 0.6, end: 1.0 }, - ]; - headers.set('openai-word-boundaries', JSON.stringify(boundaries)); + if ((params.response_format as string) === "json") { + const boundaries = [ + { word: "Hello", start: 0.1, end: 0.5 }, + { word: "world", start: 0.6, end: 1.0 }, + ]; + headers.set("openai-word-boundaries", JSON.stringify(boundaries)); } const mockResponse = { - ok: true, - status: 200, - statusText: 'OK', - headers: headers, - arrayBuffer: async () => mockAudioBuffer, - body: new ReadableStream({ - async start(controller) { - controller.enqueue(Buffer.from("mock stream chunk 1")); - controller.enqueue(Buffer.from("mock stream chunk 2")); - controller.close(); - } - }), - json: async () => ({}), + ok: true, + status: 200, + statusText: "OK", + headers: headers, + arrayBuffer: async () => mockAudioBuffer, + body: new ReadableStream({ + async start(controller) { + controller.enqueue(Buffer.from("mock stream chunk 1")); + controller.enqueue(Buffer.from("mock stream chunk 2")); + controller.close(); + }, + }), + json: async () => ({}), }; return mockResponse as Response; }) as any), @@ -58,7 +63,7 @@ const mockOpenAIInstance = { describe("OpenAITTSClient", () => { let client: OpenAITTSClient; - let mockStream: any; + let mockStream: any; beforeEach(() => { mock( @@ -70,7 +75,9 @@ describe("OpenAITTSClient", () => { mockStream = { write: jest.fn(), - end: jest.fn(((cb?: () => void) => { if (cb) cb(); }) as any), + end: jest.fn(((cb?: () => void) => { + if (cb) cb(); + }) as any), on: jest.fn(), once: jest.fn(), emit: jest.fn(), @@ -84,7 +91,7 @@ 
describe("OpenAITTSClient", () => { (client as any).client = mockOpenAIInstance; - jest.clearAllMocks(); + jest.clearAllMocks(); }); afterEach(() => { @@ -94,7 +101,7 @@ describe("OpenAITTSClient", () => { test("should initialize with default values", () => { expect(client).toBeDefined(); expect(client.getProperty("model")).toBe("tts-1"); - expect(client.getProperty("voice")).toBe("alloy"); + expect(client.getProperty("voice")).toBe("alloy"); expect(client.getProperty("instructions")).toBe(""); expect(client.getProperty("responseFormat")).toBe("mp3"); }); @@ -132,15 +139,19 @@ describe("OpenAITTSClient", () => { }); test("should convert text to speech", async () => { - const outputPath = await client.textToSpeech("Hello world", { outputFile: "openai-output.mp3" }); + const outputPath = await client.textToSpeech("Hello world", { + outputFile: "openai-output.mp3", + }); expect(outputPath).toBe("openai-output.mp3"); - expect(fs.existsSync(outputPath)).toBe(true); + expect(fs.existsSync(outputPath)).toBe(true); }); test("should convert text to speech with streaming", async () => { - const outputPath = await client.textToSpeechStreaming("Hello stream", { outputFile: "openai-streaming-output.mp3" }); + const outputPath = await client.textToSpeechStreaming("Hello stream", { + outputFile: "openai-streaming-output.mp3", + }); expect(outputPath).toBe("openai-streaming-output.mp3"); - expect(fs.existsSync(outputPath)).toBe(true); + expect(fs.existsSync(outputPath)).toBe(true); }); test("should throw error for SSML", async () => { @@ -165,8 +176,11 @@ describe("OpenAITTSClient", () => { test("should handle onEnd callback", async () => { const onEndMock = jest.fn(); const outputPath = "test.mp3"; - await client.textToSpeechStreaming("Test sentence.", { outputFile: outputPath, onEnd: onEndMock }); + await client.textToSpeechStreaming("Test sentence.", { + outputFile: outputPath, + onEnd: onEndMock, + }); expect(fs.existsSync(outputPath)).toBe(true); - 
expect(onEndMock).toHaveBeenCalled(); + expect(onEndMock).toHaveBeenCalled(); }); }); diff --git a/src/__tests__/polly-ssml.test.ts b/src/__tests__/polly-ssml.test.ts index d27926d..facf681 100644 --- a/src/__tests__/polly-ssml.test.ts +++ b/src/__tests__/polly-ssml.test.ts @@ -1,17 +1,17 @@ -import { describe, it, expect, beforeAll, afterAll } from "@jest/globals"; import * as fs from "node:fs"; import * as os from "node:os"; import * as path from "node:path"; +import { afterAll, beforeAll, describe, expect, it } from "@jest/globals"; import { PollyTTSClient } from "../engines/polly"; import type { UnifiedVoice } from "../types"; // Load environment variables from .env file -const envFile = path.join(process.cwd(), '.env'); +const envFile = path.join(process.cwd(), ".env"); if (fs.existsSync(envFile)) { - const envContent = fs.readFileSync(envFile, 'utf8'); - const envLines = envContent.split('\n'); + const envContent = fs.readFileSync(envFile, "utf8"); + const envLines = envContent.split("\n"); for (const line of envLines) { - if (line.trim() && !line.startsWith('#')) { + if (line.trim() && !line.startsWith("#")) { const match = line.match(/^export\s+([A-Za-z0-9_]+)="(.*)"/); if (match) { const [, key, value] = match; @@ -19,14 +19,14 @@ if (fs.existsSync(envFile)) { } } } - console.log('Environment variables loaded from .env file for Polly SSML tests'); + console.log("Environment variables loaded from .env file for Polly SSML tests"); } else { - console.log('No .env file found for Polly SSML tests'); + console.log("No .env file found for Polly SSML tests"); } /** * Polly SSML Engine Detection and Handling Tests - * + * * This test suite verifies that the Polly engine correctly: * 1. Detects voice engine types from AWS API (standard, neural, long-form, generative) * 2. 
Applies appropriate SSML handling based on engine capabilities @@ -50,7 +50,7 @@ describe("Polly SSML Engine Detection and Handling", () => { const credentials = { region: process.env.POLLY_REGION || "us-east-1", accessKeyId: process.env.POLLY_AWS_KEY_ID || "fake-key", - secretAccessKey: process.env.POLLY_AWS_ACCESS_KEY || "fake-secret" + secretAccessKey: process.env.POLLY_AWS_ACCESS_KEY || "fake-secret", }; client = new PollyTTSClient(credentials); @@ -68,7 +68,9 @@ describe("Polly SSML Engine Detection and Handling", () => { } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); - console.log(`Polly: Credentials not available or invalid (${errorMessage}), skipping SSML engine tests`); + console.log( + `Polly: Credentials not available or invalid (${errorMessage}), skipping SSML engine tests` + ); runTests = false; } }); @@ -81,16 +83,18 @@ describe("Polly SSML Engine Detection and Handling", () => { } // Check that voices have metadata with supported engines - const voicesWithEngines = voices.filter(voice => - voice.metadata?.supportedEngines && voice.metadata.supportedEngines.length > 0 + const voicesWithEngines = voices.filter( + (voice) => voice.metadata?.supportedEngines && voice.metadata.supportedEngines.length > 0 ); expect(voicesWithEngines.length).toBeGreaterThan(0); - + // Log some examples for debugging const exampleVoices = voicesWithEngines.slice(0, 5); - exampleVoices.forEach(voice => { - console.log(`Voice ${voice.id}: supports engines [${voice.metadata?.supportedEngines?.join(', ')}]`); + exampleVoices.forEach((voice) => { + console.log( + `Voice ${voice.id}: supports engines [${voice.metadata?.supportedEngines?.join(", ")}]` + ); }); }); @@ -103,13 +107,13 @@ describe("Polly SSML Engine Detection and Handling", () => { const engineTypes = { standard: 0, neural: 0, - 'long-form': 0, - generative: 0 + "long-form": 0, + generative: 0, }; - voices.forEach(voice => { + voices.forEach((voice) => { const engines 
= voice.metadata?.supportedEngines || []; - engines.forEach(engine => { + engines.forEach((engine) => { if (engine in engineTypes) { engineTypes[engine as keyof typeof engineTypes]++; } @@ -117,7 +121,7 @@ describe("Polly SSML Engine Detection and Handling", () => { }); console.log("Engine type distribution:", engineTypes); - + // We should have at least some standard and neural voices expect(engineTypes.standard + engineTypes.neural).toBeGreaterThan(0); }); @@ -132,11 +136,11 @@ describe("Polly SSML Engine Detection and Handling", () => { // Test a few different voice types if available const testVoices = voices.slice(0, 3); - + for (const voice of testVoices) { const engines = voice.metadata?.supportedEngines || []; - console.log(`Testing voice ${voice.id} with engines: [${engines.join(', ')}]`); - + console.log(`Testing voice ${voice.id} with engines: [${engines.join(", ")}]`); + // We can't directly test the private method, but we can test the behavior // by checking if SSML is processed correctly const ssmlText = ` @@ -151,10 +155,12 @@ describe("Polly SSML Engine Detection and Handling", () => { // Set the voice and try to synthesize client.setVoice(voice.id); const audioBytes = await client.synthToBytes(ssmlText, { format: "mp3" }); - + // If we get audio bytes, the engine handled the SSML appropriately expect(audioBytes.length).toBeGreaterThan(0); - console.log(`Voice ${voice.id}: Successfully processed SSML (${audioBytes.length} bytes)`); + console.log( + `Voice ${voice.id}: Successfully processed SSML (${audioBytes.length} bytes)` + ); } catch (error) { // Some voices might not be available in the test region console.log(`Voice ${voice.id}: Synthesis failed - ${error}`); @@ -171,7 +177,7 @@ describe("Polly SSML Engine Detection and Handling", () => { } // Find a neural voice if available - const neuralVoice = voices.find(voice => + const neuralVoice = voices.find((voice) => voice.metadata?.supportedEngines?.includes("neural") ); @@ -189,10 +195,12 @@ 
describe("Polly SSML Engine Detection and Handling", () => { try { client.setVoice(neuralVoice.id); const audioBytes = await client.synthToBytes(ssmlWithEmphasis, { format: "mp3" }); - + // Neural voices should strip emphasis tags but still produce audio expect(audioBytes.length).toBeGreaterThan(0); - console.log(`Neural voice ${neuralVoice.id}: Successfully handled emphasis tags (${audioBytes.length} bytes)`); + console.log( + `Neural voice ${neuralVoice.id}: Successfully handled emphasis tags (${audioBytes.length} bytes)` + ); } catch (error) { console.log(`Neural voice ${neuralVoice.id}: Synthesis failed - ${error}`); } @@ -205,12 +213,13 @@ describe("Polly SSML Engine Detection and Handling", () => { } // Test both standard and neural voices if available - const standardVoice = voices.find(voice => - voice.metadata?.supportedEngines?.includes("standard") && - !voice.metadata?.supportedEngines?.includes("neural") + const standardVoice = voices.find( + (voice) => + voice.metadata?.supportedEngines?.includes("standard") && + !voice.metadata?.supportedEngines?.includes("neural") ); - - const neuralVoice = voices.find(voice => + + const neuralVoice = voices.find((voice) => voice.metadata?.supportedEngines?.includes("neural") ); @@ -226,7 +235,9 @@ describe("Polly SSML Engine Detection and Handling", () => { client.setVoice(standardVoice.id); const audioBytes = await client.synthToBytes(ssmlWithProsody, { format: "mp3" }); expect(audioBytes.length).toBeGreaterThan(0); - console.log(`Standard voice ${standardVoice.id}: Successfully processed prosody (${audioBytes.length} bytes)`); + console.log( + `Standard voice ${standardVoice.id}: Successfully processed prosody (${audioBytes.length} bytes)` + ); } catch (error) { console.log(`Standard voice ${standardVoice.id}: Synthesis failed - ${error}`); } @@ -238,7 +249,9 @@ describe("Polly SSML Engine Detection and Handling", () => { client.setVoice(neuralVoice.id); const audioBytes = await 
client.synthToBytes(ssmlWithProsody, { format: "mp3" }); expect(audioBytes.length).toBeGreaterThan(0); - console.log(`Neural voice ${neuralVoice.id}: Successfully processed prosody (${audioBytes.length} bytes)`); + console.log( + `Neural voice ${neuralVoice.id}: Successfully processed prosody (${audioBytes.length} bytes)` + ); } catch (error) { console.log(`Neural voice ${neuralVoice.id}: Synthesis failed - ${error}`); } @@ -254,7 +267,7 @@ describe("Polly SSML Engine Detection and Handling", () => { } // Find a voice that supports multiple engines including neural - const multiEngineVoice = voices.find(voice => { + const multiEngineVoice = voices.find((voice) => { const engines = voice.metadata?.supportedEngines || []; return engines.includes("neural") && engines.length > 1; }); @@ -264,15 +277,19 @@ describe("Polly SSML Engine Detection and Handling", () => { return; } - console.log(`Testing engine preference for voice ${multiEngineVoice.id} with engines: [${multiEngineVoice.metadata?.supportedEngines?.join(', ')}]`); - + console.log( + `Testing engine preference for voice ${multiEngineVoice.id} with engines: [${multiEngineVoice.metadata?.supportedEngines?.join(", ")}]` + ); + // The engine selection logic should prefer neural over standard // We can't directly test the private method, but we can verify synthesis works try { client.setVoice(multiEngineVoice.id); const audioBytes = await client.synthToBytes("Testing engine selection", { format: "mp3" }); expect(audioBytes.length).toBeGreaterThan(0); - console.log(`Multi-engine voice ${multiEngineVoice.id}: Successfully synthesized (${audioBytes.length} bytes)`); + console.log( + `Multi-engine voice ${multiEngineVoice.id}: Successfully synthesized (${audioBytes.length} bytes)` + ); } catch (error) { console.log(`Multi-engine voice ${multiEngineVoice.id}: Synthesis failed - ${error}`); } diff --git a/src/__tests__/resemble.test.ts b/src/__tests__/resemble.test.ts index ae0b0f4..fc618b5 100644 --- 
a/src/__tests__/resemble.test.ts +++ b/src/__tests__/resemble.test.ts @@ -1,17 +1,41 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { ResembleTTSClient } from "../engines/resemble"; import { createTTSClient } from "../factory"; describe("ResembleTTSClient", () => { let client: ResembleTTSClient; - beforeEach(() => { client = new ResembleTTSClient({ apiKey: "test" }); }); + beforeEach(() => { + client = new ResembleTTSClient({ apiKey: "test" }); + }); - it("initializes with defaults", () => { expect(client).toBeDefined(); }); - it("sets voice via voiceId", () => { client.setVoice("uuid-123"); expect(client.getProperty("voice")).toBe("uuid-123"); }); - it("checks credentials without key", async () => { expect(await new ResembleTTSClient({}).checkCredentials()).toBe(false); }); - it("creates via factory", () => { expect(createTTSClient("resemble", { apiKey: "t" })).toBeInstanceOf(ResembleTTSClient); }); - it("strips SSML", async () => { expect(await (client as any).prepareText("Hi")).toBe("Hi"); }); - it("creates word timings", () => { (client as any)._createEstimatedWordTimings("a b"); expect((client as any).timings.length).toBe(2); }); - it("supports events", () => { const fn = jest.fn(); client.on("start", fn); (client as any).emit("start"); expect(fn).toHaveBeenCalled(); }); - it("credential status", async () => { const s = await client.getCredentialStatus(); expect(s.engine).toBe("resemble"); }); + it("initializes with defaults", () => { + expect(client).toBeDefined(); + }); + it("sets voice via voiceId", () => { + client.setVoice("uuid-123"); + expect(client.getProperty("voice")).toBe("uuid-123"); + }); + it("checks credentials without key", async () => { + expect(await new ResembleTTSClient({}).checkCredentials()).toBe(false); + }); + it("creates via factory", () => { + expect(createTTSClient("resemble", { apiKey: "t" 
})).toBeInstanceOf(ResembleTTSClient); + }); + it("strips SSML", async () => { + expect(await (client as any).prepareText("Hi")).toBe("Hi"); + }); + it("creates word timings", () => { + (client as any)._createEstimatedWordTimings("a b"); + expect((client as any).timings.length).toBe(2); + }); + it("supports events", () => { + const fn = jest.fn(); + client.on("start", fn); + (client as any).emit("start"); + expect(fn).toHaveBeenCalled(); + }); + it("credential status", async () => { + const s = await client.getCredentialStatus(); + expect(s.engine).toBe("resemble"); + }); }); diff --git a/src/__tests__/sapi-ssml-handling.test.ts b/src/__tests__/sapi-ssml-handling.test.ts index dbd9e5e..e66879f 100644 --- a/src/__tests__/sapi-ssml-handling.test.ts +++ b/src/__tests__/sapi-ssml-handling.test.ts @@ -1,16 +1,16 @@ -import { SAPITTSClient } from "../engines/sapi"; -import { describe, it, expect, beforeAll } from '@jest/globals'; +import { beforeAll, describe, expect, it } from "@jest/globals"; import * as os from "os"; +import { SAPITTSClient } from "../engines/sapi"; /** * SAPI SSML Handling Tests - * + * * This test suite specifically verifies that SAPI properly handles: * 1. Plain text (should be automatically wrapped in SSML) * 2. SSML with proper speak tags * 3. SSML fragments without speak tags (should be wrapped) * 4. Mixed content scenarios - * + * * The key issue being tested: SAPI should not read SSML tags literally * when they are present but not properly wrapped. 
*/ @@ -44,28 +44,28 @@ describe("SAPI SSML Handling", () => { it.skip(`${testName} (Windows only)`, () => {}); return; } - + if (!client) { it.skip(`${testName} (SAPI not available)`, () => {}); return; } - + it(testName, testFn, 30000); // 30 second timeout for SAPI operations }; runTest("should handle plain text without reading tags literally", async () => { const plainText = "Hello world, this is a test."; - + // This should work without any issues - plain text gets wrapped in SSML const audioBytes = await client!.synthToBytes(plainText); - + expect(audioBytes).toBeInstanceOf(Uint8Array); expect(audioBytes.length).toBeGreaterThan(0); - + // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should handle proper SSML without issues", async () => { @@ -74,16 +74,16 @@ describe("SAPI SSML Handling", () => { This should work correctly. `; - + const audioBytes = await client!.synthToBytes(properSSML); - + expect(audioBytes).toBeInstanceOf(Uint8Array); expect(audioBytes.length).toBeGreaterThan(0); - + // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should handle SSML fragments by wrapping them properly", async () => { @@ -92,16 +92,16 @@ describe("SAPI SSML Handling", () => { const ssmlFragment = `This is a test with SSML tags. 
The tags should not be read literally.`; - + const audioBytes = await client!.synthToBytes(ssmlFragment); - + expect(audioBytes).toBeInstanceOf(Uint8Array); expect(audioBytes.length).toBeGreaterThan(0); - + // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should handle simple speak tags without version attributes", async () => { @@ -110,16 +110,16 @@ describe("SAPI SSML Handling", () => { It should work correctly. `; - + const audioBytes = await client!.synthToBytes(simpleSSML); - + expect(audioBytes).toBeInstanceOf(Uint8Array); expect(audioBytes.length).toBeGreaterThan(0); - + // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should handle prosody tags correctly", async () => { @@ -128,16 +128,16 @@ describe("SAPI SSML Handling", () => { This text should be spoken quickly with a high pitch. 
`; - + const audioBytes = await client!.synthToBytes(prosodySSML); - + expect(audioBytes).toBeInstanceOf(Uint8Array); expect(audioBytes.length).toBeGreaterThan(0); - + // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should handle mixed content with various SSML elements", async () => { @@ -157,9 +157,9 @@ describe("SAPI SSML Handling", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); runTest("should respect voice selection with setVoice", async () => { @@ -168,7 +168,7 @@ describe("SAPI SSML Handling", () => { expect(voices.length).toBeGreaterThan(0); // Find a German voice if available (like TTS_MS_DE-DE_HEDDA_11.0) - const germanVoice = voices.find(v => v.id.includes('DE-DE') || v.name.includes('German')); + const germanVoice = voices.find((v) => v.id.includes("DE-DE") || v.name.includes("German")); if (germanVoice) { // Set the German voice @@ -182,9 +182,9 @@ describe("SAPI SSML Handling", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + 
expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); } else { // If no German voice available, test with any available voice const firstVoice = voices[0]; @@ -197,9 +197,9 @@ describe("SAPI SSML Handling", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); } }); @@ -218,8 +218,8 @@ describe("SAPI SSML Handling", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); }); }); diff --git a/src/__tests__/sapi-voice-selection.test.ts b/src/__tests__/sapi-voice-selection.test.ts index e8e65ae..21aea7f 100644 --- a/src/__tests__/sapi-voice-selection.test.ts +++ b/src/__tests__/sapi-voice-selection.test.ts @@ -1,10 +1,10 @@ -import { SAPITTSClient } from "../engines/sapi"; -import { describe, it, expect, beforeAll } from '@jest/globals'; +import { beforeAll, describe, expect, it } from "@jest/globals"; import * as os from "os"; +import { SAPITTSClient } from "../engines/sapi"; /** * SAPI Voice Selection Tests - * + * * This test suite specifically verifies that the SAPI voice selection bug is fixed: * - setVoice() should properly select the specified voice * - Voice selection via options parameter should work @@ -35,7 +35,7 @@ describe("SAPI Voice Selection", () => { console.log("Skipping 
SAPI tests - not on Windows"); return; } - + expect(client).not.toBeNull(); }); @@ -48,9 +48,9 @@ describe("SAPI Voice Selection", () => { const voices = await client.getVoices(); expect(voices).toBeDefined(); expect(voices.length).toBeGreaterThan(0); - + console.log(`Found ${voices.length} SAPI voices:`); - voices.forEach(voice => { + voices.forEach((voice) => { console.log(` - ${voice.id}: ${voice.name} (${voice.gender})`); }); }); @@ -75,11 +75,13 @@ describe("SAPI Voice Selection", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); - console.log(`✓ Successfully synthesized ${audioBytes.length} bytes with voice: ${testVoice.id}`); + console.log( + `✓ Successfully synthesized ${audioBytes.length} bytes with voice: ${testVoice.id}` + ); }); it("should respect voice selection via options parameter", async () => { @@ -101,11 +103,13 @@ describe("SAPI Voice Selection", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); - console.log(`✓ Successfully synthesized ${audioBytes.length} bytes via options with voice: ${testVoice.id}`); + console.log( + `✓ Successfully synthesized ${audioBytes.length} bytes via options with voice: ${testVoice.id}` + ); }); it("should work with different voices if available", async () => { @@ -161,9 
+165,9 @@ describe("SAPI Voice Selection", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); console.log(`✓ Successfully synthesized complex SSML: ${audioBytes.length} bytes`); }); @@ -189,9 +193,9 @@ describe("SAPI Voice Selection", () => { expect(audioBytes.length).toBeGreaterThan(0); // Check for WAV header - const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString('ascii'); - expect(wavHeader.startsWith('RIFF')).toBe(true); - expect(wavHeader.includes('WAVE')).toBe(true); + const wavHeader = Buffer.from(audioBytes.slice(0, 12)).toString("ascii"); + expect(wavHeader.startsWith("RIFF")).toBe(true); + expect(wavHeader.includes("WAVE")).toBe(true); console.log(`✓ Successfully synthesized SSML with version: ${audioBytes.length} bytes`); }); diff --git a/src/__tests__/sherpa-optional-import.test.ts b/src/__tests__/sherpa-optional-import.test.ts index da37c30..3c8e7e0 100644 --- a/src/__tests__/sherpa-optional-import.test.ts +++ b/src/__tests__/sherpa-optional-import.test.ts @@ -1,21 +1,20 @@ -import { describe, it, expect } from '@jest/globals'; +import { describe, expect, it } from "@jest/globals"; // This test ensures that importing the package entry does not eagerly require // optional native dependencies like `sherpa-onnx-node`. The original bug report // indicated that simply requiring the package failed across all engines because // sherpa-onnx was pulled in at import-time. 
-describe('package import (no sherpa installed)', () => { - it('does not throw when importing the root entry', async () => { +describe("package import (no sherpa installed)", () => { + it("does not throw when importing the root entry", async () => { // Use isolateModules to ensure a fresh module load for this test await expect( (async () => { // Import the library entry. This must not attempt to load sherpa-onnx-node. - const mod = await import('..'); + const mod = await import(".."); // Basic sanity check: a known export should exist - expect(mod).toHaveProperty('AzureTTSClient'); + expect(mod).toHaveProperty("AzureTTSClient"); })() ).resolves.not.toThrow(); }); }); - diff --git a/src/__tests__/speech-markdown-converter.test.ts b/src/__tests__/speech-markdown-converter.test.ts index a7e948e..48f4c98 100644 --- a/src/__tests__/speech-markdown-converter.test.ts +++ b/src/__tests__/speech-markdown-converter.test.ts @@ -32,7 +32,8 @@ describe("SpeechMarkdown", () => { }); it("should convert rate modifier (text)[rate:'slow'] to SSML", async () => { - const markdown = "(I can speak text slow)[rate:'x-slow'] and (I can speak text fast)[rate:'x-fast']"; + const markdown = + "(I can speak text slow)[rate:'x-slow'] and (I can speak text fast)[rate:'x-fast']"; const result = await SpeechMarkdown.toSSML(markdown, "microsoft-azure"); expect(result).toContain(' { }); it("should convert pitch modifier (text)[pitch:'high'] to SSML", async () => { - const markdown = "(I can speak text high)[pitch:'high'] and (I can speak text low)[pitch:'low']"; + const markdown = + "(I can speak text high)[pitch:'high'] and (I can speak text low)[pitch:'low']"; const result = await SpeechMarkdown.toSSML(markdown, "microsoft-azure"); expect(result).toContain(' { }); it("should convert volume modifier (text)[volume:'loud'] to SSML", async () => { - const markdown = "(I can speak text loud)[volume:'loud'] and (I can speak text soft)[volume:'soft']"; + const markdown = + "(I can speak text 
loud)[volume:'loud'] and (I can speak text soft)[volume:'soft']"; const result = await SpeechMarkdown.toSSML(markdown, "microsoft-azure"); expect(result).toContain(' { - expect(SpeechMarkdown.isSpeechMarkdown("Hello (slowly)[rate:\"slow\"] world")).toBe(true); + expect(SpeechMarkdown.isSpeechMarkdown('Hello (slowly)[rate:"slow"] world')).toBe(true); }); it("should detect pitch", () => { - expect(SpeechMarkdown.isSpeechMarkdown("Hello (high)[pitch:\"high\"] world")).toBe(true); + expect(SpeechMarkdown.isSpeechMarkdown('Hello (high)[pitch:"high"] world')).toBe(true); }); it("should detect volume", () => { - expect(SpeechMarkdown.isSpeechMarkdown("Hello (loud)[volume:\"loud\"] world")).toBe(true); + expect(SpeechMarkdown.isSpeechMarkdown('Hello (loud)[volume:"loud"] world')).toBe(true); }); it("should return false for plain text", () => { diff --git a/src/__tests__/ssml-compatibility.test.ts b/src/__tests__/ssml-compatibility.test.ts index 89e20c9..a7fc7a3 100644 --- a/src/__tests__/ssml-compatibility.test.ts +++ b/src/__tests__/ssml-compatibility.test.ts @@ -1,10 +1,10 @@ +import { describe, expect, it } from "@jest/globals"; import { SSMLCompatibilityManager } from "../core/ssml-compatibility"; import * as SSMLUtils from "../core/ssml-utils"; -import { describe, it, expect } from '@jest/globals'; /** * SSML Compatibility Tests - * + * * This test suite verifies that the SSML compatibility layer: * 1. Correctly identifies engine capabilities * 2. 
Validates SSML for different engines @@ -15,57 +15,57 @@ import { describe, it, expect } from '@jest/globals'; describe("SSML Compatibility Manager", () => { describe("Engine Capabilities", () => { it("should return correct capabilities for SAPI", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('sapi'); + const capabilities = SSMLCompatibilityManager.getCapabilities("sapi"); expect(capabilities.supportsSSML).toBe(true); - expect(capabilities.supportLevel).toBe('full'); + expect(capabilities.supportLevel).toBe("full"); expect(capabilities.requiresVersion).toBe(true); expect(capabilities.requiresNamespace).toBe(false); }); it("should return correct capabilities for Azure", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('azure'); + const capabilities = SSMLCompatibilityManager.getCapabilities("azure"); expect(capabilities.supportsSSML).toBe(true); - expect(capabilities.supportLevel).toBe('full'); + expect(capabilities.supportLevel).toBe("full"); expect(capabilities.requiresVersion).toBe(true); expect(capabilities.requiresNamespace).toBe(true); }); it("should return correct capabilities for ElevenLabs", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('elevenlabs'); + const capabilities = SSMLCompatibilityManager.getCapabilities("elevenlabs"); expect(capabilities.supportsSSML).toBe(false); - expect(capabilities.supportLevel).toBe('none'); - expect(capabilities.unsupportedTags).toContain('*'); + expect(capabilities.supportLevel).toBe("none"); + expect(capabilities.unsupportedTags).toContain("*"); }); it("should return no SSML support for unknown engines", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('unknown-engine'); + const capabilities = SSMLCompatibilityManager.getCapabilities("unknown-engine"); expect(capabilities.supportsSSML).toBe(false); - expect(capabilities.supportLevel).toBe('none'); + expect(capabilities.supportLevel).toBe("none"); }); }); 
describe("Voice-Specific Capabilities", () => { it("should detect Polly neural voice limitations", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('polly', 'Joanna-Neural'); - expect(capabilities.supportLevel).toBe('limited'); - expect(capabilities.unsupportedTags).toContain('emphasis'); + const capabilities = SSMLCompatibilityManager.getCapabilities("polly", "Joanna-Neural"); + expect(capabilities.supportLevel).toBe("limited"); + expect(capabilities.unsupportedTags).toContain("emphasis"); }); it("should detect Polly standard voice full support", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('polly', 'Joanna'); - expect(capabilities.supportLevel).toBe('full'); + const capabilities = SSMLCompatibilityManager.getCapabilities("polly", "Joanna"); + expect(capabilities.supportLevel).toBe("full"); expect(capabilities.unsupportedTags).toHaveLength(0); }); it("should detect Google Neural2 voice limitations", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('google', 'en-US-Neural2-F'); - expect(capabilities.supportLevel).toBe('limited'); - expect(capabilities.unsupportedTags).toContain('mark'); + const capabilities = SSMLCompatibilityManager.getCapabilities("google", "en-US-Neural2-F"); + expect(capabilities.supportLevel).toBe("limited"); + expect(capabilities.unsupportedTags).toContain("mark"); }); it("should detect Google Standard voice full support", () => { - const capabilities = SSMLCompatibilityManager.getCapabilities('google', 'en-US-Standard-A'); - expect(capabilities.supportLevel).toBe('full'); + const capabilities = SSMLCompatibilityManager.getCapabilities("google", "en-US-Standard-A"); + expect(capabilities.supportLevel).toBe("full"); }); }); @@ -79,28 +79,28 @@ describe("SSML Compatibility Manager", () => { `; it("should validate SSML for full support engines", () => { - const result = SSMLCompatibilityManager.validateSSML(validSSML, 'sapi'); + const result = 
SSMLCompatibilityManager.validateSSML(validSSML, "sapi"); expect(result.isValid).toBe(true); expect(result.errors).toHaveLength(0); }); it("should warn about unsupported tags for limited engines", () => { - const result = SSMLCompatibilityManager.validateSSML(validSSML, 'polly', 'Joanna-Neural'); + const result = SSMLCompatibilityManager.validateSSML(validSSML, "polly", "Joanna-Neural"); expect(result.isValid).toBe(true); - expect(result.warnings.some(w => w.includes('emphasis'))).toBe(true); + expect(result.warnings.some((w) => w.includes("emphasis"))).toBe(true); }); it("should warn about no SSML support for non-SSML engines", () => { - const result = SSMLCompatibilityManager.validateSSML(validSSML, 'elevenlabs'); + const result = SSMLCompatibilityManager.validateSSML(validSSML, "elevenlabs"); expect(result.isValid).toBe(true); - expect(result.warnings.some(w => w.includes('does not support SSML'))).toBe(true); + expect(result.warnings.some((w) => w.includes("does not support SSML"))).toBe(true); }); it("should detect invalid SSML structure", () => { const invalidSSML = "This is not SSML"; - const result = SSMLCompatibilityManager.validateSSML(invalidSSML, 'sapi'); + const result = SSMLCompatibilityManager.validateSSML(invalidSSML, "sapi"); expect(result.isValid).toBe(false); - expect(result.errors.some(e => e.includes('wrapped in tags'))).toBe(true); + expect(result.errors.some((e) => e.includes("wrapped in tags"))).toBe(true); }); }); @@ -112,29 +112,33 @@ describe("SSML Compatibility Manager", () => { `; it("should preserve SSML for full support engines", () => { - const processed = SSMLCompatibilityManager.processSSMLForEngine(testSSML, 'sapi'); - expect(processed).toContain(' { - const processed = SSMLCompatibilityManager.processSSMLForEngine(testSSML, 'polly', 'Joanna-Neural'); - expect(processed).toContain(' { - const processed = SSMLCompatibilityManager.processSSMLForEngine(testSSML, 'elevenlabs'); - expect(processed).not.toContain(' { - const processed = 
SSMLCompatibilityManager.processSSMLForEngine(testSSML, 'azure'); + const processed = SSMLCompatibilityManager.processSSMLForEngine(testSSML, "azure"); expect(processed).toContain('xmlns="http://www.w3.org/2001/10/synthesis"'); expect(processed).toContain('version="1.0"'); }); @@ -142,14 +146,14 @@ describe("SSML Compatibility Manager", () => { describe("Integration with SSMLUtils", () => { it("should integrate validateSSMLForEngine function", () => { - const testSSML = 'Hello world'; - const result = SSMLUtils.validateSSMLForEngine(testSSML, 'sapi'); + const testSSML = "Hello world"; + const result = SSMLUtils.validateSSMLForEngine(testSSML, "sapi"); expect(result.isValid).toBe(true); }); it("should integrate processSSMLForEngine function", () => { - const testSSML = 'Hello world'; - const processed = SSMLUtils.processSSMLForEngine(testSSML, 'sapi'); + const testSSML = "Hello world"; + const processed = SSMLUtils.processSSMLForEngine(testSSML, "sapi"); expect(processed).toContain('version="1.0"'); }); }); @@ -167,28 +171,28 @@ describe("SSML Compatibility Manager", () => { `; it("should handle complex SSML for Azure", () => { - const processed = SSMLUtils.processSSMLForEngine(complexSSML, 'azure'); - expect(processed).toContain('xmlns='); - expect(processed).toContain('version='); - expect(processed).toContain(' { - const processed = SSMLUtils.processSSMLForEngine(complexSSML, 'polly', 'Joanna-Neural'); - expect(processed).toContain(' { - const processed = SSMLUtils.processSSMLForEngine(complexSSML, 'openai'); - expect(processed).not.toContain(' { expect(fs.statSync(outputPath).size).toBeGreaterThan(0); console.log(`${engineName}: Basic SSML test passed`); - + // Clean up if (fs.existsSync(outputPath)) { fs.unlinkSync(outputPath); @@ -222,7 +242,7 @@ describe("Comprehensive SSML Testing", () => { expect(fs.statSync(outputPath).size).toBeGreaterThan(0); console.log(`${engineName}: Prosody SSML test passed`); - + // Clean up if (fs.existsSync(outputPath)) { 
fs.unlinkSync(outputPath); @@ -261,7 +281,7 @@ describe("Comprehensive SSML Testing", () => { expect(fs.statSync(outputPath).size).toBeGreaterThan(0); console.log(`${engineName}: Break SSML test passed`); - + // Clean up if (fs.existsSync(outputPath)) { fs.unlinkSync(outputPath); @@ -298,7 +318,7 @@ describe("Comprehensive SSML Testing", () => { expect(fs.statSync(outputPath).size).toBeGreaterThan(0); console.log(`${engineName}: Emphasis SSML test passed`); - + // Clean up if (fs.existsSync(outputPath)) { fs.unlinkSync(outputPath); @@ -368,7 +388,7 @@ describe("Comprehensive SSML Testing", () => { expect(fs.statSync(outputPath).size).toBeGreaterThan(0); console.log(`${engineName}: SSML stripping test passed`); - + // Clean up if (fs.existsSync(outputPath)) { fs.unlinkSync(outputPath); @@ -396,23 +416,25 @@ function isServiceIssue(error: any): boolean { const errorStatus = error?.status || 0; // Check for service issues that can occur during synthesis even with valid credentials - return errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('ratelimiterror') || - errorMessage.includes('exceeded your current quota') || - errorMessage.includes('service unavailable') || - errorMessage.includes('temporarily unavailable') || - errorMessage.includes('server error') || - errorMessage.includes('404') || - // Network-layer issues we should treat as transient/unreliable in CI - errorMessage.includes('terminated') || - errorMessage.includes('econnreset') || - errorMessage.includes('und_err_socket') || - errorMessage.includes('socket') || - errorStatus === 429 || // Rate limit - errorStatus === 404 || // Not found (e.g., API feature not enabled) - errorStatus === 500 || // Server error - errorStatus === 502 || // Bad gateway - errorStatus === 503 || // Service unavailable - errorStatus === 504; // Gateway timeout + return ( + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + 
errorMessage.includes("ratelimiterror") || + errorMessage.includes("exceeded your current quota") || + errorMessage.includes("service unavailable") || + errorMessage.includes("temporarily unavailable") || + errorMessage.includes("server error") || + errorMessage.includes("404") || + // Network-layer issues we should treat as transient/unreliable in CI + errorMessage.includes("terminated") || + errorMessage.includes("econnreset") || + errorMessage.includes("und_err_socket") || + errorMessage.includes("socket") || + errorStatus === 429 || // Rate limit + errorStatus === 404 || // Not found (e.g., API feature not enabled) + errorStatus === 500 || // Server error + errorStatus === 502 || // Bad gateway + errorStatus === 503 || // Service unavailable + errorStatus === 504 + ); // Gateway timeout } diff --git a/src/__tests__/tts-engine.test.ts b/src/__tests__/tts-engine.test.ts index 3eb12d4..2685de3 100644 --- a/src/__tests__/tts-engine.test.ts +++ b/src/__tests__/tts-engine.test.ts @@ -1,20 +1,19 @@ +import { afterAll, beforeAll, describe, expect, it } from "@jest/globals"; import * as fs from "fs"; -import * as path from "path"; import * as os from "os"; -import { describe, it, expect, beforeAll, afterAll } from '@jest/globals'; +import * as path from "path"; import type { AbstractTTSClient } from "../core/abstract-tts"; import { AzureTTSClient } from "../engines/azure"; import { ElevenLabsTTSClient } from "../engines/elevenlabs"; import { GoogleTTSClient } from "../engines/google"; -import { PollyTTSClient } from "../engines/polly"; - import { OpenAITTSClient } from "../engines/openai"; import { PlayHTTTSClient } from "../engines/playht"; -import { WatsonTTSClient } from "../engines/watson"; -import { WitAITTSClient } from "../engines/witai"; +import { PollyTTSClient } from "../engines/polly"; import { SAPITTSClient } from "../engines/sapi"; import { UpliftAITTSClient } from "../engines/upliftai"; +import { WatsonTTSClient } from "../engines/watson"; +import { 
WitAITTSClient } from "../engines/witai"; import { MockTTSClient } from "./mock-tts-client.helper"; // Use mocks for tests to avoid API calls @@ -63,7 +62,11 @@ async function createTTSClient(engine: string): Promise { @@ -178,7 +194,7 @@ engines.forEach((engineName) => { afterAll(async () => { // Give any pending async operations time to complete - await new Promise(resolve => setTimeout(resolve, 100)); + await new Promise((resolve) => setTimeout(resolve, 100)); client = null; }); @@ -204,11 +220,12 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { console.log(`${engineName}: Skipping test due to service issue:`, errorMessage); @@ -246,11 +263,12 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? 
error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { console.log(`${engineName}: Skipping test due to service issue:`, errorMessage); @@ -314,14 +332,18 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { - console.log(`${engineName}: Skipping synthToBytes test due to service issue:`, errorMessage); + console.log( + `${engineName}: Skipping synthToBytes test due to service issue:`, + errorMessage + ); return; } else { console.error(`${engineName}: Unexpected error in synthToBytes:`, error); @@ -367,11 +389,12 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? 
error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { console.log(`${engineName}: Skipping SSML test due to service issue:`, errorMessage); @@ -445,11 +468,12 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { console.log(`${engineName}: Skipping streaming test due to service issue:`, errorMessage); @@ -495,14 +519,18 @@ engines.forEach((engineName) => { } catch (error) { // Check if this is a known service/credential issue const errorMessage = error instanceof Error ? 
error.message : String(error); - const isServiceIssue = errorMessage.includes('credentials') || - errorMessage.includes('unauthorized') || - errorMessage.includes('quota') || - errorMessage.includes('rate limit') || - errorMessage.includes('service unavailable'); + const isServiceIssue = + errorMessage.includes("credentials") || + errorMessage.includes("unauthorized") || + errorMessage.includes("quota") || + errorMessage.includes("rate limit") || + errorMessage.includes("service unavailable"); if (isServiceIssue) { - console.log(`${engineName}: Skipping word boundary test due to service issue:`, errorMessage); + console.log( + `${engineName}: Skipping word boundary test due to service issue:`, + errorMessage + ); return; } else { console.error(`${engineName}: Unexpected error in word boundary events:`, error); diff --git a/src/__tests__/unrealspeech.test.ts b/src/__tests__/unrealspeech.test.ts index 6eb312c..eba41a2 100644 --- a/src/__tests__/unrealspeech.test.ts +++ b/src/__tests__/unrealspeech.test.ts @@ -1,18 +1,45 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { UnrealSpeechTTSClient } from "../engines/unrealspeech"; import { createTTSClient } from "../factory"; describe("UnrealSpeechTTSClient", () => { let client: UnrealSpeechTTSClient; - beforeEach(() => { client = new UnrealSpeechTTSClient({ apiKey: "test" }); }); + beforeEach(() => { + client = new UnrealSpeechTTSClient({ apiKey: "test" }); + }); - it("initializes with defaults", () => { expect(client.getProperty("voice")).toBe("Sierra"); }); - it("sets voice via voiceId", () => { client.setVoice("Dan"); expect(client.getProperty("voice")).toBe("Dan"); }); - it("checks credentials without key", async () => { expect(await new UnrealSpeechTTSClient({}).checkCredentials()).toBe(false); }); - it("creates via factory", () => { expect(createTTSClient("unrealspeech", { apiKey: "t" 
})).toBeInstanceOf(UnrealSpeechTTSClient); }); - it("gets voices", async () => { const v = await client.getVoices(); expect(v.length).toBeGreaterThan(0); expect(v[0].provider).toBe("unrealspeech"); }); - it("strips SSML", async () => { expect(await (client as any).prepareText("Hi")).toBe("Hi"); }); - it("creates word timings", () => { (client as any)._createEstimatedWordTimings("a b c"); expect((client as any).timings.length).toBe(3); }); - it("supports events", () => { const fn = jest.fn(); client.on("end", fn); (client as any).emit("end"); expect(fn).toHaveBeenCalled(); }); - it("has correct engine name", () => { expect((client as any).constructor.name).toBe("UnrealSpeechTTSClient"); }); + it("initializes with defaults", () => { + expect(client.getProperty("voice")).toBe("Sierra"); + }); + it("sets voice via voiceId", () => { + client.setVoice("Dan"); + expect(client.getProperty("voice")).toBe("Dan"); + }); + it("checks credentials without key", async () => { + expect(await new UnrealSpeechTTSClient({}).checkCredentials()).toBe(false); + }); + it("creates via factory", () => { + expect(createTTSClient("unrealspeech", { apiKey: "t" })).toBeInstanceOf(UnrealSpeechTTSClient); + }); + it("gets voices", async () => { + const v = await client.getVoices(); + expect(v.length).toBeGreaterThan(0); + expect(v[0].provider).toBe("unrealspeech"); + }); + it("strips SSML", async () => { + expect(await (client as any).prepareText("Hi")).toBe("Hi"); + }); + it("creates word timings", () => { + (client as any)._createEstimatedWordTimings("a b c"); + expect((client as any).timings.length).toBe(3); + }); + it("supports events", () => { + const fn = jest.fn(); + client.on("end", fn); + (client as any).emit("end"); + expect(fn).toHaveBeenCalled(); + }); + it("has correct engine name", () => { + expect((client as any).constructor.name).toBe("UnrealSpeechTTSClient"); + }); }); diff --git a/src/__tests__/upliftai.test.ts b/src/__tests__/upliftai.test.ts index a13947b..dc0a5c8 100644 --- 
a/src/__tests__/upliftai.test.ts +++ b/src/__tests__/upliftai.test.ts @@ -1,45 +1,45 @@ -import { describe, it, expect, beforeEach, jest } from '@jest/globals'; -import { UpliftAITTSClient } from '../engines/upliftai'; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { UpliftAITTSClient } from "../engines/upliftai"; // Mock fetch globally const globalAny: any = global; globalAny.fetch = jest.fn(); -describe('UpliftAITTSClient', () => { +describe("UpliftAITTSClient", () => { let client: UpliftAITTSClient; beforeEach(() => { - client = new UpliftAITTSClient({ apiKey: 'test-key' }); + client = new UpliftAITTSClient({ apiKey: "test-key" }); (globalAny.fetch as jest.Mock).mockReset(); }); - it('should get voices from static list', async () => { + it("should get voices from static list", async () => { const voices = await client.getVoices(); expect(Array.isArray(voices)).toBe(true); expect(voices.length).toBe(4); - expect(voices[0]).toHaveProperty('id'); - expect(voices[0]).toHaveProperty('name'); + expect(voices[0]).toHaveProperty("id"); + expect(voices[0]).toHaveProperty("name"); }); - it('should synthesize text to bytes', async () => { - const mockBuffer = new Uint8Array([1,2,3]).buffer; + it("should synthesize text to bytes", async () => { + const mockBuffer = new Uint8Array([1, 2, 3]).buffer; (globalAny.fetch as jest.Mock).mockResolvedValueOnce(new Response(mockBuffer, { status: 200 })); - const bytes = await client.synthToBytes('hello'); + const bytes = await client.synthToBytes("hello"); expect(bytes).toBeInstanceOf(Uint8Array); expect(bytes.length).toBe(3); }); - it('should synthesize text to stream', async () => { + it("should synthesize text to stream", async () => { const stream = new ReadableStream({ start(controller) { - controller.enqueue(new Uint8Array([4,5])); + controller.enqueue(new Uint8Array([4, 5])); controller.close(); - } + }, }); (globalAny.fetch as jest.Mock).mockResolvedValueOnce(new Response(stream, { status: 200 
})); - const result = await client.synthToBytestream('hi'); + const result = await client.synthToBytestream("hi"); const reader = result.audioStream.getReader(); const chunks: number[] = []; while (true) { @@ -47,7 +47,7 @@ describe('UpliftAITTSClient', () => { if (done) break; chunks.push(...value); } - expect(chunks).toEqual([4,5]); + expect(chunks).toEqual([4, 5]); expect(result.wordBoundaries).toEqual([]); }); }); diff --git a/src/__tests__/watson.test.ts b/src/__tests__/watson.test.ts index ce8f13f..2373250 100644 --- a/src/__tests__/watson.test.ts +++ b/src/__tests__/watson.test.ts @@ -1,48 +1,49 @@ -import { describe, it, expect, jest, beforeAll } from '@jest/globals'; -import { WatsonTTSClient } from '../engines/watson'; +import { beforeAll, describe, expect, it, jest } from "@jest/globals"; +import { WatsonTTSClient } from "../engines/watson"; // Mock fetch for testing global.fetch = jest.fn(); -describe('WatsonTTSClient', () => { +describe("WatsonTTSClient", () => { let client: WatsonTTSClient; beforeAll(() => { // Create a client with mock credentials client = new WatsonTTSClient({ - apiKey: 'test-api-key', - region: 'us-south', - instanceId: 'test-instance-id' + apiKey: "test-api-key", + region: "us-south", + instanceId: "test-instance-id", }); // Mock the IAM token refresh (global.fetch as jest.Mock).mockImplementation((url: string, options: RequestInit) => { - if (url.includes('iam.cloud.ibm.com/identity/token')) { + if (url.includes("iam.cloud.ibm.com/identity/token")) { return Promise.resolve({ ok: true, - json: () => Promise.resolve({ access_token: 'mock-token' }) + json: () => Promise.resolve({ access_token: "mock-token" }), }); - } else if (url.includes('voices')) { + } else if (url.includes("voices")) { return Promise.resolve({ ok: true, - json: () => Promise.resolve({ - voices: [ - { - name: 'en-US_AllisonV3Voice', - language: 'en-US', - gender: 'female', - description: 'Allison: American English female voice' - }, - { - name: 
'en-US_MichaelV3Voice', - language: 'en-US', - gender: 'male', - description: 'Michael: American English male voice' - } - ] - }) + json: () => + Promise.resolve({ + voices: [ + { + name: "en-US_AllisonV3Voice", + language: "en-US", + gender: "female", + description: "Allison: American English female voice", + }, + { + name: "en-US_MichaelV3Voice", + language: "en-US", + gender: "male", + description: "Michael: American English male voice", + }, + ], + }), }); - } else if (url.includes('synthesize')) { + } else if (url.includes("synthesize")) { return Promise.resolve({ ok: true, arrayBuffer: () => Promise.resolve(new ArrayBuffer(1024)), @@ -50,82 +51,82 @@ describe('WatsonTTSClient', () => { start(controller) { controller.enqueue(new Uint8Array(1024)); controller.close(); - } - }) + }, + }), }); } return Promise.reject(new Error(`Unhandled URL: ${url}`)); }); }); - it('should initialize with credentials', () => { + it("should initialize with credentials", () => { expect(client).toBeDefined(); expect(client).toBeInstanceOf(WatsonTTSClient); }); - it('should get voices', async () => { + it("should get voices", async () => { const voices = await client.getVoices(); expect(voices).toBeDefined(); expect(Array.isArray(voices)).toBe(true); expect(voices.length).toBe(2); - + // Check voice structure const voice = voices[0]; - expect(voice).toHaveProperty('id', 'en-US_AllisonV3Voice'); - expect(voice).toHaveProperty('name', 'Allison'); - expect(voice).toHaveProperty('gender', 'Female'); - expect(voice).toHaveProperty('provider', 'ibm'); - expect(voice).toHaveProperty('languageCodes'); - expect(voice.languageCodes[0]).toHaveProperty('bcp47', 'en-US'); + expect(voice).toHaveProperty("id", "en-US_AllisonV3Voice"); + expect(voice).toHaveProperty("name", "Allison"); + expect(voice).toHaveProperty("gender", "Female"); + expect(voice).toHaveProperty("provider", "ibm"); + expect(voice).toHaveProperty("languageCodes"); + expect(voice.languageCodes[0]).toHaveProperty("bcp47", 
"en-US"); }); - it('should get voices by language', async () => { - const voices = await client.getVoicesByLanguage('en-US'); + it("should get voices by language", async () => { + const voices = await client.getVoicesByLanguage("en-US"); expect(voices).toBeDefined(); expect(Array.isArray(voices)).toBe(true); expect(voices.length).toBe(2); - + // Check that all voices are for the requested language for (const voice of voices) { - expect(voice.languageCodes.some(lang => lang.bcp47 === 'en-US')).toBe(true); + expect(voice.languageCodes.some((lang) => lang.bcp47 === "en-US")).toBe(true); } }); - it('should set voice', () => { - client.setVoice('en-US_AllisonV3Voice'); + it("should set voice", () => { + client.setVoice("en-US_AllisonV3Voice"); // This is testing an internal property, but it's the simplest way to verify - // @ts-ignore - Accessing private property for testing - expect(client.voiceId).toBe('en-US_AllisonV3Voice'); + // @ts-expect-error - Accessing private property for testing + expect(client.voiceId).toBe("en-US_AllisonV3Voice"); }); - it('should synthesize text to bytes', async () => { - const result = await client.synthToBytes('Hello world'); + it("should synthesize text to bytes", async () => { + const result = await client.synthToBytes("Hello world"); expect(result).toBeDefined(); expect(result).toBeInstanceOf(Uint8Array); expect(result.length).toBe(1024); }); - it('should synthesize text to bytestream', async () => { - const result = await client.synthToBytestream('Hello world'); + it("should synthesize text to bytestream", async () => { + const result = await client.synthToBytestream("Hello world"); expect(result).toBeDefined(); - expect(result).toHaveProperty('audioStream'); - expect(result).toHaveProperty('wordBoundaries'); + expect(result).toHaveProperty("audioStream"); + expect(result).toHaveProperty("wordBoundaries"); expect(Array.isArray(result.wordBoundaries)).toBe(true); }); - it('should check credentials', async () => { + it("should check 
credentials", async () => { const result = await client.checkCredentials(); expect(result).toBe(true); }); - it('should handle invalid credentials', async () => { + it("should handle invalid credentials", async () => { // Mock fetch to simulate invalid credentials const originalFetch = global.fetch; (global.fetch as jest.Mock).mockImplementationOnce((url: string) => { - if (url.includes('voices')) { + if (url.includes("voices")) { return Promise.resolve({ ok: false, - statusText: 'Unauthorized' + statusText: "Unauthorized", }); } return originalFetch(url); @@ -133,9 +134,9 @@ describe('WatsonTTSClient', () => { // Create a new client with invalid credentials const invalidClient = new WatsonTTSClient({ - apiKey: 'invalid-key', - region: 'us-south', - instanceId: 'invalid-instance' + apiKey: "invalid-key", + region: "us-south", + instanceId: "invalid-instance", }); try { diff --git a/src/__tests__/witai.test.ts b/src/__tests__/witai.test.ts index 04a49eb..b5c7a96 100644 --- a/src/__tests__/witai.test.ts +++ b/src/__tests__/witai.test.ts @@ -1,160 +1,158 @@ -import { describe, it, expect, jest, beforeAll } from '@jest/globals'; -import { WitAITTSClient } from '../engines/witai'; +import { beforeAll, describe, expect, it, jest } from "@jest/globals"; +import { WitAITTSClient } from "../engines/witai"; // Mock fetch for testing global.fetch = jest.fn() as jest.MockedFunction; -describe('WitAITTSClient', () => { +describe("WitAITTSClient", () => { let client: WitAITTSClient; beforeAll(() => { // Create a client with mock credentials client = new WitAITTSClient({ - token: 'test-token' + token: "test-token", }); // Mock the API responses - (global.fetch as jest.MockedFunction).mockImplementation((input: RequestInfo | URL, options?: RequestInit) => { - const url = input.toString(); - if (url.includes('/voices')) { - return Promise.resolve( - new Response( - JSON.stringify({ - "en_US": [ - { - "name": "witai$Alex", - "gender": "male", - "styles": ["calm", "cheerful"] - }, - 
{ - "name": "witai$Samantha", - "gender": "female", - "styles": ["calm", "cheerful"] - } - ], - "fr_FR": [ - { - "name": "witai$Jean", - "gender": "male", - "styles": ["calm"] - } - ] - }), - { - status: 200, - headers: new Headers({ 'Content-Type': 'application/json' }) - } - ) - ); - } else if (url.includes('/synthesize')) { - const mockArrayBuffer = new ArrayBuffer(1024); - const mockStream = new ReadableStream({ - start(controller) { - controller.enqueue(new Uint8Array(1024)); - controller.close(); - } - }); - return Promise.resolve( - new Response( - mockStream, - { - status: 200, - headers: new Headers({ 'Content-Type': 'audio/raw' }) - } - ) - ); + (global.fetch as jest.MockedFunction).mockImplementation( + (input: RequestInfo | URL, options?: RequestInit) => { + const url = input.toString(); + if (url.includes("/voices")) { + return Promise.resolve( + new Response( + JSON.stringify({ + en_US: [ + { + name: "witai$Alex", + gender: "male", + styles: ["calm", "cheerful"], + }, + { + name: "witai$Samantha", + gender: "female", + styles: ["calm", "cheerful"], + }, + ], + fr_FR: [ + { + name: "witai$Jean", + gender: "male", + styles: ["calm"], + }, + ], + }), + { + status: 200, + headers: new Headers({ "Content-Type": "application/json" }), + } + ) + ); + } else if (url.includes("/synthesize")) { + const mockArrayBuffer = new ArrayBuffer(1024); + const mockStream = new ReadableStream({ + start(controller) { + controller.enqueue(new Uint8Array(1024)); + controller.close(); + }, + }); + return Promise.resolve( + new Response(mockStream, { + status: 200, + headers: new Headers({ "Content-Type": "audio/raw" }), + }) + ); + } + return Promise.reject(new Error(`Unhandled URL: ${url}`)); } - return Promise.reject(new Error(`Unhandled URL: ${url}`)); - }); + ); }); - it('should initialize with credentials', () => { + it("should initialize with credentials", () => { expect(client).toBeDefined(); expect(client).toBeInstanceOf(WitAITTSClient); }); - it('should throw an 
error if no token is provided', () => { + it("should throw an error if no token is provided", () => { expect(() => { new WitAITTSClient({ - token: '' + token: "", }); - }).toThrow('An API token for Wit.ai must be provided'); + }).toThrow("An API token for Wit.ai must be provided"); }); - it('should get voices', async () => { + it("should get voices", async () => { const voices = await client.getVoices(); expect(voices).toBeDefined(); expect(Array.isArray(voices)).toBe(true); expect(voices.length).toBe(3); - + // Check voice structure const voice = voices[0]; - expect(voice).toHaveProperty('id'); - expect(voice).toHaveProperty('name'); - expect(voice).toHaveProperty('gender'); - expect(voice).toHaveProperty('provider', 'witai'); - expect(voice).toHaveProperty('languageCodes'); - expect(voice.languageCodes[0]).toHaveProperty('bcp47'); + expect(voice).toHaveProperty("id"); + expect(voice).toHaveProperty("name"); + expect(voice).toHaveProperty("gender"); + expect(voice).toHaveProperty("provider", "witai"); + expect(voice).toHaveProperty("languageCodes"); + expect(voice.languageCodes[0]).toHaveProperty("bcp47"); }); - it('should get voices by language', async () => { - const voices = await client.getVoicesByLanguage('en-US'); + it("should get voices by language", async () => { + const voices = await client.getVoicesByLanguage("en-US"); expect(voices).toBeDefined(); expect(Array.isArray(voices)).toBe(true); expect(voices.length).toBe(2); - + // Check that all voices are for the requested language for (const voice of voices) { - expect(voice.languageCodes.some(lang => lang.bcp47 === 'en-US')).toBe(true); + expect(voice.languageCodes.some((lang) => lang.bcp47 === "en-US")).toBe(true); } }); - it('should set voice', () => { - client.setVoice('witai$Alex'); + it("should set voice", () => { + client.setVoice("witai$Alex"); // This is testing an internal property, but it's the simplest way to verify - // @ts-ignore - Accessing private property for testing - 
expect(client.voiceId).toBe('witai$Alex'); + // @ts-expect-error - Accessing private property for testing + expect(client.voiceId).toBe("witai$Alex"); }); - it('should synthesize text to bytes', async () => { - const result = await client.synthToBytes('Hello world'); + it("should synthesize text to bytes", async () => { + const result = await client.synthToBytes("Hello world"); expect(result).toBeDefined(); expect(result).toBeInstanceOf(Uint8Array); expect(result.length).toBe(1024); }); - it('should synthesize text to bytestream', async () => { - const result = await client.synthToBytestream('Hello world'); + it("should synthesize text to bytestream", async () => { + const result = await client.synthToBytestream("Hello world"); expect(result).toBeDefined(); - expect(result).toHaveProperty('audioStream'); - expect(result).toHaveProperty('wordBoundaries'); + expect(result).toHaveProperty("audioStream"); + expect(result).toHaveProperty("wordBoundaries"); expect(Array.isArray(result.wordBoundaries)).toBe(true); expect(result.wordBoundaries.length).toBe(2); // "Hello" and "world" }); - it('should handle invalid credentials', async () => { + it("should handle invalid credentials", async () => { // Mock fetch to simulate invalid credentials const originalFetch = global.fetch; - (global.fetch as jest.MockedFunction).mockImplementationOnce((input: RequestInfo | URL) => { - const url = input.toString(); - if (url.includes('/voices')) { - return Promise.resolve( - new Response( - null, - { - status: 401, - statusText: 'Unauthorized', - headers: new Headers({ 'Content-Type': 'application/json' }) - } - ) - ); + (global.fetch as jest.MockedFunction).mockImplementationOnce( + (input: RequestInfo | URL) => { + const url = input.toString(); + if (url.includes("/voices")) { + return Promise.resolve( + new Response(null, { + status: 401, + statusText: "Unauthorized", + headers: new Headers({ "Content-Type": "application/json" }), + }) + ); + } + return originalFetch(url); } - return 
originalFetch(url); - }); + ); // Create a new client with invalid credentials const invalidClient = new WitAITTSClient({ - token: 'invalid-token' + token: "invalid-token", }); try { diff --git a/src/__tests__/xai.test.ts b/src/__tests__/xai.test.ts index 3cd266d..d7b96f2 100644 --- a/src/__tests__/xai.test.ts +++ b/src/__tests__/xai.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, jest, beforeEach } from "@jest/globals"; +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; import { XaiTTSClient } from "../engines/xai"; import { createTTSClient } from "../factory"; diff --git a/src/core/abstract-tts.ts b/src/core/abstract-tts.ts index 1353aa7..809b7db 100644 --- a/src/core/abstract-tts.ts +++ b/src/core/abstract-tts.ts @@ -1,3 +1,4 @@ +import * as SpeechMarkdown from "../markdown/converter"; import { SSMLBuilder } from "../ssml/builder"; import type { CredentialsCheckResult, @@ -10,12 +11,11 @@ import type { UnifiedVoice, WordBoundaryCallback, } from "../types"; -import { LanguageNormalizer } from "./language-utils"; -import * as SSMLUtils from "./ssml-utils"; -import { isBrowser, isNode } from "../utils/environment"; import type { AudioFormat } from "../utils/audio-converter"; import { detectAudioFormat } from "../utils/audio-input"; -import * as SpeechMarkdown from "../markdown/converter"; +import { isBrowser, isNode } from "../utils/environment"; +import { LanguageNormalizer } from "./language-utils"; +import * as SSMLUtils from "./ssml-utils"; /** * Abstract base class for all TTS clients @@ -71,11 +71,12 @@ export abstract class AbstractTTSClient { * Capability signaling for UIs to filter providers without hardcoding names * Engines can override these in their constructors. 
*/ - public capabilities: { browserSupported: boolean; nodeSupported: boolean; needsWasm?: boolean } = { - browserSupported: true, - nodeSupported: true, - needsWasm: false, - }; + public capabilities: { browserSupported: boolean; nodeSupported: boolean; needsWasm?: boolean } = + { + browserSupported: true, + nodeSupported: true, + needsWasm: false, + }; /** * Audio sample rate in Hz @@ -179,25 +180,38 @@ export abstract class AbstractTTSClient { // Try to convert if conversion is available (Node only) if (!isNode) { - console.warn(`Audio format conversion not available in browser. Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})`); + console.warn( + `Audio format conversion not available in browser. Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})` + ); return nativeAudioBytes; } try { - const { isAudioConversionAvailable, convertAudioFormat } = await (new Function('m','return import(m)'))('../utils/audio-converter'); + const { isAudioConversionAvailable, convertAudioFormat } = await new Function( + "m", + "return import(m)" + )("../utils/audio-converter"); if (isAudioConversionAvailable()) { try { const conversionResult = await convertAudioFormat(nativeAudioBytes, requestedFormat); return conversionResult.audioBytes; } catch (error) { - console.warn(`Audio format conversion failed: ${error instanceof Error ? error.message : String(error)}`); - console.warn(`Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})`); + console.warn( + `Audio format conversion failed: ${error instanceof Error ? error.message : String(error)}` + ); + console.warn( + `Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})` + ); } } else { - console.warn(`Audio format conversion not available. 
Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})`); + console.warn( + `Audio format conversion not available. Returning native format (${nativeFormat}) instead of requested format (${requestedFormat})` + ); } } catch { - console.warn(`Audio converter not available at runtime; returning native format (${nativeFormat})`); + console.warn( + `Audio converter not available at runtime; returning native format (${nativeFormat})` + ); } // Fallback: return native audio @@ -300,11 +314,10 @@ export abstract class AbstractTTSClient { // Check if we're in a browser environment if (isBrowser) { - // Create audio blob and URL with the correct MIME type const ab = new ArrayBuffer(audioBytes.byteLength); - new Uint8Array(ab).set(audioBytes); - const blob = new Blob([ab], { type: mimeType }); + new Uint8Array(ab).set(audioBytes); + const blob = new Blob([ab], { type: mimeType }); const url = URL.createObjectURL(blob); // Create and play audio element @@ -348,7 +361,7 @@ export abstract class AbstractTTSClient { // In Node.js environment, try to use sound-play try { // Check if Node.js audio playback is available - const nodeAudioModule = await import('../utils/node-audio.js'); + const nodeAudioModule = await import("../utils/node-audio.js"); const { isNodeAudioAvailable, playAudioInNode } = nodeAudioModule; const audioAvailable = await isNodeAudioAvailable(); @@ -366,14 +379,20 @@ export abstract class AbstractTTSClient { // Play audio using our node-audio utility // Pass the engine name to handle Polly audio differently - await playAudioInNode(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase()); + await playAudioInNode( + audioBytes, + this.sampleRate, + this.constructor.name.replace("TTSClient", "").toLowerCase() + ); // Emit end event this.emit("end"); } else { console.log("Audio playback in Node.js requires the sound-play package."); console.log("Install it with: npm install 
js-tts-wrapper[node-audio]"); - console.log("Or use synthToFile() to save audio to a file and play it with an external player."); + console.log( + "Or use synthToFile() to save audio to a file and play it with an external player." + ); // Fire word boundary callbacks immediately when audio playback is not available this._fireWordBoundaryCallbacks(); @@ -387,7 +406,9 @@ export abstract class AbstractTTSClient { } else { // Unknown environment console.log("Audio playback is not supported in this environment."); - console.log("Use synthToFile() to save audio to a file and play it with an external player."); + console.log( + "Use synthToFile() to save audio to a file and play it with an external player." + ); this.emit("end"); } } catch (error) { @@ -445,24 +466,32 @@ export abstract class AbstractTTSClient { // Apply format conversion if needed (for streaming, we convert the final buffer) if (normalizedOptions?.format) { - if (!isNode) { - // Browser: no conversion; just detect MIME from bytes - mimeType = detectAudioFormat(audioBytes); - } else { + if (isNode) { try { - const { isAudioConversionAvailable, convertAudioFormat } = await (new Function('m','return import(m)'))('../utils/audio-converter'); - if (isAudioConversionAvailable()) { - const conversionResult = await convertAudioFormat(audioBytes, normalizedOptions.format as AudioFormat); - audioBytes = conversionResult.audioBytes; - mimeType = conversionResult.mimeType; - } else { + const { isAudioConversionAvailable, convertAudioFormat } = await new Function( + "m", + "return import(m)" + )("../utils/audio-converter"); + if (isAudioConversionAvailable()) { + const conversionResult = await convertAudioFormat( + audioBytes, + normalizedOptions.format as AudioFormat + ); + audioBytes = conversionResult.audioBytes; + mimeType = conversionResult.mimeType; + } else { + mimeType = detectAudioFormat(audioBytes); + } + } catch (error) { + console.warn( + `Streaming format conversion failed: ${error instanceof Error ? 
error.message : String(error)}` + ); mimeType = detectAudioFormat(audioBytes); } - } catch (error) { - console.warn(`Streaming format conversion failed: ${error instanceof Error ? error.message : String(error)}`); + } else { + // Browser: no conversion; just detect MIME from bytes mimeType = detectAudioFormat(audioBytes); } - } } else { // Determine MIME type based on actual audio format mimeType = detectAudioFormat(audioBytes); @@ -479,8 +508,6 @@ export abstract class AbstractTTSClient { text = ""; // No text available for audio input } - - // Use actual word boundaries if available, otherwise create estimated ones if (wordBoundaries.length > 0) { // Convert the word boundaries to our internal format @@ -499,11 +526,10 @@ export abstract class AbstractTTSClient { // Check if we're in a browser environment if (isBrowser) { - // Create audio blob and URL with the correct MIME type const ab = new ArrayBuffer(audioBytes.byteLength); - new Uint8Array(ab).set(audioBytes); - const blob = new Blob([ab], { type: mimeType }); + new Uint8Array(ab).set(audioBytes); + const blob = new Blob([ab], { type: mimeType }); const url = URL.createObjectURL(blob); // Create and play audio element @@ -540,10 +566,10 @@ export abstract class AbstractTTSClient { audio.src = url; } else if (isNode) { // In Node.js environment, try to use sound-play - console.log('🔍 Taking Node.js audio path'); + console.log("🔍 Taking Node.js audio path"); try { // Check if Node.js audio playback is available - const nodeAudioModule = await import('../utils/node-audio.js'); + const nodeAudioModule = await import("../utils/node-audio.js"); const { isNodeAudioAvailable, playAudioInNode } = nodeAudioModule; const audioAvailable = await isNodeAudioAvailable(); console.log(`🔍 Audio available: ${audioAvailable}`); @@ -556,20 +582,26 @@ export abstract class AbstractTTSClient { } if (audioAvailable) { - console.log('🔍 Audio available - scheduling word boundary callbacks'); + console.log("🔍 Audio available - 
scheduling word boundary callbacks"); // Schedule word boundary callbacks this._scheduleWordBoundaryCallbacks(); // Play audio using our node-audio utility with the engine's sample rate // Pass the engine name to handle Polly audio differently - await playAudioInNode(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase()); + await playAudioInNode( + audioBytes, + this.sampleRate, + this.constructor.name.replace("TTSClient", "").toLowerCase() + ); // Emit end event this.emit("end"); } else { console.log("Audio playback in Node.js requires the sound-play package."); console.log("Install it with: npm install js-tts-wrapper[node-audio]"); - console.log("Or use synthToFile() to save audio to a file and play it with an external player."); + console.log( + "Or use synthToFile() to save audio to a file and play it with an external player." + ); // Fire word boundary callbacks immediately this._fireWordBoundaryCallbacks(); @@ -583,7 +615,9 @@ export abstract class AbstractTTSClient { } else { // Unknown environment console.log("Audio playback is not supported in this environment."); - console.log("Use synthToFile() to save audio to a file and play it with an external player."); + console.log( + "Use synthToFile() to save audio to a file and play it with an external player." + ); // Create estimated word timings if needed and we have text if (text) { @@ -646,7 +680,7 @@ export abstract class AbstractTTSClient { } else if (isNode) { // In Node.js, use the file system const outputPath = filename.endsWith(`.${format}`) ? 
filename : `${filename}.${format}`; - const fs = await (new Function('m','return import(m)'))('node:fs'); + const fs = await new Function("m", "return import(m)")("node:fs"); fs.writeFileSync(outputPath, Buffer.from(audioBytes)); } else { console.warn("File saving not implemented for this environment."); @@ -681,14 +715,16 @@ export abstract class AbstractTTSClient { // Node.js environment - use node-speaker try { // Import dynamically to avoid circular dependencies - import('./node-audio-control.js').then(nodeAudio => { - const paused = nodeAudio.pauseAudioPlayback(); - if (paused) { - this.audio.isPaused = true; - } - }).catch(error => { - console.error("Error importing node-audio-control:", error); - }); + import("./node-audio-control.js") + .then((nodeAudio) => { + const paused = nodeAudio.pauseAudioPlayback(); + if (paused) { + this.audio.isPaused = true; + } + }) + .catch((error) => { + console.error("Error importing node-audio-control:", error); + }); } catch (error) { console.error("Error pausing audio in Node.js:", error); } @@ -709,14 +745,16 @@ export abstract class AbstractTTSClient { // Node.js environment - use node-speaker try { // Import dynamically to avoid circular dependencies - import('./node-audio-control.js').then(nodeAudio => { - const resumed = nodeAudio.resumeAudioPlayback(); - if (resumed) { - this.audio.isPaused = false; - } - }).catch(error => { - console.error("Error importing node-audio-control:", error); - }); + import("./node-audio-control.js") + .then((nodeAudio) => { + const resumed = nodeAudio.resumeAudioPlayback(); + if (resumed) { + this.audio.isPaused = false; + } + }) + .catch((error) => { + console.error("Error importing node-audio-control:", error); + }); } catch (error) { console.error("Error resuming audio in Node.js:", error); } @@ -739,15 +777,17 @@ export abstract class AbstractTTSClient { // Node.js environment - use node-speaker try { // Import dynamically to avoid circular dependencies - 
import('./node-audio-control.js').then(nodeAudio => { - const stopped = nodeAudio.stopAudioPlayback(); - if (stopped) { - this.audio.isPlaying = false; - this.audio.isPaused = false; - } - }).catch(error => { - console.error("Error importing node-audio-control:", error); - }); + import("./node-audio-control.js") + .then((nodeAudio) => { + const stopped = nodeAudio.stopAudioPlayback(); + if (stopped) { + this.audio.isPlaying = false; + this.audio.isPaused = false; + } + }) + .catch((error) => { + console.error("Error importing node-audio-control:", error); + }); } catch (error) { console.error("Error stopping audio in Node.js:", error); } @@ -790,7 +830,7 @@ export abstract class AbstractTTSClient { this.emit("boundary", { text: word, offset: Math.round(start * 10000), // Convert to 100-nanosecond units - duration: Math.round((end - start) * 10000) + duration: Math.round((end - start) * 10000), }); } } @@ -807,7 +847,7 @@ export abstract class AbstractTTSClient { const event = { text: word, offset: Math.round(start * 10000), // Convert to 100-nanosecond units - duration: Math.round((end - start) * 10000) + duration: Math.round((end - start) * 10000), }; setTimeout(() => { @@ -837,7 +877,10 @@ export abstract class AbstractTTSClient { /** * Normalize Speech Markdown options to auto-enable conversion when detected. */ - protected normalizeSpeechMarkdownOptions(text: string, options?: SpeakOptions): SpeakOptions | undefined { + protected normalizeSpeechMarkdownOptions( + text: string, + options?: SpeakOptions + ): SpeakOptions | undefined { if (options?.useSpeechMarkdown !== undefined) { return options; } @@ -979,13 +1022,13 @@ export abstract class AbstractTTSClient { const voices = await this._getVoices(); return { success: voices.length > 0, - voiceCount: voices.length + voiceCount: voices.length, }; } catch (error) { console.error("Error checking credentials:", error); return { success: false, - error: error instanceof Error ? 
error.message : String(error) + error: error instanceof Error ? error.message : String(error), }; } } @@ -997,15 +1040,15 @@ export abstract class AbstractTTSClient { async getCredentialStatus(): Promise<{ valid: boolean; engine: string; - environment: 'browser' | 'node'; + environment: "browser" | "node"; requiresCredentials: boolean; credentialTypes: string[]; message: string; details?: Record; error?: string; }> { - const isBrowser = typeof window !== 'undefined'; - const engineName = this.constructor.name.replace('TTSClient', '').toLowerCase(); + const isBrowser = typeof window !== "undefined"; + const engineName = this.constructor.name.replace("TTSClient", "").toLowerCase(); try { const isValid = await this.checkCredentials(); @@ -1023,26 +1066,26 @@ export abstract class AbstractTTSClient { return { valid: isValid, engine: engineName, - environment: isBrowser ? 'browser' : 'node', + environment: isBrowser ? "browser" : "node", requiresCredentials: requiresCreds, credentialTypes: this.getRequiredCredentials(), - message: isValid ? - `${engineName} credentials are valid and ${voices.length} voices are available` : - `${engineName} credentials are invalid or service is unavailable`, + message: isValid + ? `${engineName} credentials are valid and ${voices.length} voices are available` + : `${engineName} credentials are invalid or service is unavailable`, details: { voiceCount: voices.length, - hasCredentials: Object.keys(this.credentials || {}).length > 0 - } + hasCredentials: Object.keys(this.credentials || {}).length > 0, + }, }; } catch (error) { return { valid: false, engine: engineName, - environment: isBrowser ? 'browser' : 'node', + environment: isBrowser ? "browser" : "node", requiresCredentials: this.getRequiredCredentials().length > 0, credentialTypes: this.getRequiredCredentials(), message: `Error validating ${engineName} credentials`, - error: error instanceof Error ? error.message : String(error) + error: error instanceof Error ? 
error.message : String(error), }; } } diff --git a/src/engines/azure.ts b/src/engines/azure.ts index c7503d5..59fe6ed 100644 --- a/src/engines/azure.ts +++ b/src/engines/azure.ts @@ -61,7 +61,7 @@ export class AzureTTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ protected getRequiredCredentials(): string[] { - return ['subscriptionKey', 'region']; + return ["subscriptionKey", "region"]; } /** @@ -121,7 +121,9 @@ export class AzureTTSClient extends AbstractTTSClient { */ async synthToBytes(text: string, options?: AzureTTSOptions): Promise { const ssml = await this.prepareSSML(text, options); - console.debug(`${this.constructor.name}.synthToBytes - TTS text ${ssml}, Options: ${JSON.stringify(options)}`); + console.debug( + `${this.constructor.name}.synthToBytes - TTS text ${ssml}, Options: ${JSON.stringify(options)}` + ); try { const response = await fetch( @@ -204,22 +206,24 @@ export class AzureTTSClient extends AbstractTTSClient { return this.sdk; } this.sdkLoadingPromise = new Promise((resolve) => { - const script = document.createElement('script'); - script.src = 'https://aka.ms/csspeech/jsbrowserpackageraw'; + const script = document.createElement("script"); + script.src = "https://aka.ms/csspeech/jsbrowserpackageraw"; script.async = true; script.onload = () => { this.sdk = (window as any).SpeechSDK || null; this.sdkLoadingPromise = null; if (this.sdk) { - console.log('Microsoft Speech SDK (browser) loaded successfully.'); + console.log("Microsoft Speech SDK (browser) loaded successfully."); resolve(this.sdk); } else { - console.warn('Speech SDK script loaded but window.SpeechSDK not found. Falling back to REST.'); + console.warn( + "Speech SDK script loaded but window.SpeechSDK not found. Falling back to REST." + ); resolve(null); } }; script.onerror = () => { - console.warn('Failed to load Microsoft Speech SDK (browser). Falling back to REST.'); + console.warn("Failed to load Microsoft Speech SDK (browser). 
Falling back to REST."); this.sdkLoadingPromise = null; resolve(null); }; @@ -229,9 +233,8 @@ export class AzureTTSClient extends AbstractTTSClient { } // Node: dynamic import - const dyn: any = new Function('m','return import(m)'); - // @ts-ignore - Suppress module not found error for SDK types during build - this.sdkLoadingPromise = dyn('microsoft-cognitiveservices-speech-sdk') + const dyn: any = new Function("m", "return import(m)"); + this.sdkLoadingPromise = dyn("microsoft-cognitiveservices-speech-sdk") .then((sdkModule: any) => { this.sdk = sdkModule; this.sdkLoadingPromise = null; // Reset promise after successful load @@ -240,7 +243,9 @@ export class AzureTTSClient extends AbstractTTSClient { }) .catch((_error: any) => { // Log the actual error for debugging if needed: console.error("SDK Load Error:", _error); - console.warn("microsoft-cognitiveservices-speech-sdk not found or failed to load, using REST API fallback for word boundaries."); + console.warn( + "microsoft-cognitiveservices-speech-sdk not found or failed to load, using REST API fallback for word boundaries." 
+ ); this.sdkLoadingPromise = null; // Reset promise on error this.sdk = null; // Ensure SDK is null if loading failed return null; // Indicate SDK load failed @@ -264,16 +269,21 @@ export class AzureTTSClient extends AbstractTTSClient { wordBoundaries: Array<{ text: string; offset: number; duration: number }>; }> { try { - if (!sdkInstance) { // Should not happen if called correctly, but good practice + if (!sdkInstance) { + // Should not happen if called correctly, but good practice throw new Error("Attempted to use SDK method, but SDK instance is missing."); } // Create a speech config - const speechConfig = sdkInstance.SpeechConfig.fromSubscription(this.subscriptionKey, this.region); + const speechConfig = sdkInstance.SpeechConfig.fromSubscription( + this.subscriptionKey, + this.region + ); // Set the output format - speechConfig.speechSynthesisOutputFormat = options?.format === "mp3" - ? sdkInstance.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3 - : sdkInstance.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + speechConfig.speechSynthesisOutputFormat = + options?.format === "mp3" + ? sdkInstance.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3 + : sdkInstance.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; // Set the voice if (this.voiceId) { @@ -449,12 +459,16 @@ export class AzureTTSClient extends AbstractTTSClient { try { // Create a speech config - const speechConfig = this.sdk.SpeechConfig.fromSubscription(this.subscriptionKey, this.region); + const speechConfig = this.sdk.SpeechConfig.fromSubscription( + this.subscriptionKey, + this.region + ); // Set the output format - speechConfig.speechSynthesisOutputFormat = options?.format === "mp3" - ? this.sdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3 - : this.sdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; + speechConfig.speechSynthesisOutputFormat = + options?.format === "mp3" + ? 
this.sdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3 + : this.sdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm; // Set the voice if (this.voiceId) { @@ -542,17 +556,17 @@ export class AzureTTSClient extends AbstractTTSClient { const voiceId = options?.voice || this.voiceId; // Validate and process SSML for Azure compatibility - const validation = SSMLUtils.validateSSMLForEngine(ssml, 'azure', voiceId || undefined); + const validation = SSMLUtils.validateSSMLForEngine(ssml, "azure", voiceId || undefined); if (validation.warnings.length > 0) { - console.warn('Azure SSML warnings:', validation.warnings); + console.warn("Azure SSML warnings:", validation.warnings); } if (!validation.isValid) { - console.error('Azure SSML validation errors:', validation.errors); - throw new Error(`Invalid SSML for Azure: ${validation.errors.join(', ')}`); + console.error("Azure SSML validation errors:", validation.errors); + throw new Error(`Invalid SSML for Azure: ${validation.errors.join(", ")}`); } // Process SSML for Azure compatibility - ssml = SSMLUtils.processSSMLForEngine(ssml, 'azure', voiceId || undefined); + ssml = SSMLUtils.processSSMLForEngine(ssml, "azure", voiceId || undefined); // Ensure proper SSML structure for Azure ssml = this.ensureAzureSSMLStructure(ssml, voiceId, options); @@ -567,32 +581,36 @@ export class AzureTTSClient extends AbstractTTSClient { * @param options Synthesis options * @returns Properly structured SSML for Azure */ - private ensureAzureSSMLStructure(ssml: string, voiceId?: string | null, options?: AzureTTSOptions): string { + private ensureAzureSSMLStructure( + ssml: string, + voiceId?: string | null, + options?: AzureTTSOptions + ): string { // Check if SSML contains mstts-specific tags const hasMsttsContent = /mstts:/.test(ssml); // Ensure required attributes are present - if (!ssml.includes('version=')) { - ssml = ssml.replace(']*>(.*?)<\/speak>/s); if (speakMatch) { const content = speakMatch[1].trim(); - const speakTag = 
ssml.substring(0, ssml.indexOf('>') + 1); + const speakTag = ssml.substring(0, ssml.indexOf(">") + 1); ssml = `${speakTag}${content}`; } } @@ -645,5 +663,5 @@ export class AzureTTSClient extends AbstractTTSClient { * Extended options for Azure TTS */ export interface AzureTTSOptions extends SpeakOptions { - format?: 'mp3' | 'wav'; // Define formats supported by this client logic + format?: "mp3" | "wav"; // Define formats supported by this client logic } diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index bcafc83..b4e826a 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -1,6 +1,7 @@ import { AbstractTTSClient } from "../core/abstract-tts"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice, WordBoundaryCallback } from "../types"; +import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; // Get the fetch implementation for the current environment @@ -517,8 +518,7 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { // Decode base64 audio data const audioBase64 = timestampResponse.audio_base64; - const audioBuffer = Buffer.from(audioBase64, "base64"); - audioData = new Uint8Array(audioBuffer); + audioData = base64ToUint8Array(audioBase64); // Convert character timing to word boundaries and store for events if (timestampResponse.alignment) { @@ -611,8 +611,7 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { // Decode base64 audio data const audioBase64 = timestampResponse.audio_base64; - const audioBuffer = Buffer.from(audioBase64, "base64"); - audioData = new Uint8Array(audioBuffer); + audioData = base64ToUint8Array(audioBase64); // Convert character timing to word boundaries if (timestampResponse.alignment) { diff --git a/src/engines/espeak.ts b/src/engines/espeak.ts index 59541db..362a421 100644 --- a/src/engines/espeak.ts +++ b/src/engines/espeak.ts @@ -1,7 +1,7 @@ -import { 
AbstractTTSClient } from "../core/abstract-tts"; -import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { createRequire } from "node:module"; import path from "node:path"; +import { AbstractTTSClient } from "../core/abstract-tts"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; // Dynamic text2wav module - will be loaded when needed let text2wav: any = null; @@ -31,7 +31,11 @@ async function loadText2Wav() { try { const req = typeof input === "string" ? input : input?.url; // Intercept attempts to fetch the text2wav WASM by plain file path (no scheme) - if (typeof req === "string" && req.endsWith("espeak-ng.wasm") && !/^https?:|^file:/.test(req)) { + if ( + typeof req === "string" && + req.endsWith("espeak-ng.wasm") && + !/^https?:|^file:/.test(req) + ) { const { readFileSync } = await import("node:fs"); const requireFromCwd = createRequire(path.join(process.cwd(), "noop.js")); const wasmPath = requireFromCwd.resolve("text2wav/lib/espeak-ng.wasm"); diff --git a/src/engines/fishaudio.ts b/src/engines/fishaudio.ts index e054727..fc520c5 100644 --- a/src/engines/fishaudio.ts +++ b/src/engines/fishaudio.ts @@ -127,14 +127,11 @@ export class FishAudioTTSClient extends AbstractTTSClient { async checkCredentials(): Promise { if (!this.apiKey) return false; try { - const response = await fetch(`${this.baseUrl}/v1/tts`, { - method: "POST", + const response = await fetch(`${this.baseUrl}/v1/model`, { + method: "GET", headers: { - "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, - model: this.model, }, - body: JSON.stringify({ text: "test" }), }); return response.ok; } catch { diff --git a/src/engines/google.ts b/src/engines/google.ts index f6f6f2b..3398532 100644 --- a/src/engines/google.ts +++ b/src/engines/google.ts @@ -1,7 +1,7 @@ import { AbstractTTSClient } from "../core/abstract-tts"; -import type { SpeakOptions, TTSCredentials, UnifiedVoice, WordBoundaryCallback } from "../types"; 
import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice, WordBoundaryCallback } from "../types"; // Dynamic import for Google Cloud Text-to-Speech (Node.js only) // This avoids browser import errors for Node.js-only packages @@ -36,7 +36,7 @@ export interface GoogleTTSCredentials extends TTSCredentials { * Extended options for Google TTS */ export interface GoogleTTSOptions extends SpeakOptions { - format?: 'mp3' | 'wav'; // Define formats supported by this client logic (maps to LINEAR16) + format?: "mp3" | "wav"; // Define formats supported by this client logic (maps to LINEAR16) } /** @@ -69,8 +69,6 @@ export class GoogleTTSClient extends AbstractTTSClient { this.googleCredentials = credentials; this.client = null; - - // Don't initialize the client here - do it lazily on first use // This follows the same pattern as Polly and Azure engines } @@ -89,7 +87,7 @@ export class GoogleTTSClient extends AbstractTTSClient { try { // Try to load the Google Cloud Text-to-Speech client (Node.js only) - const dynamicImport: any = new Function('m', 'return import(m)'); + const dynamicImport: any = new Function("m", "return import(m)"); const ttsModule = await dynamicImport("@google-cloud/text-to-speech"); const { TextToSpeechClient } = ttsModule; @@ -115,9 +113,7 @@ export class GoogleTTSClient extends AbstractTTSClient { // In test mode, we'll just log a warning instead of an error if (process.env.NODE_ENV === "test") { - console.warn( - "Google TTS client not initialized in test mode. Some tests may be skipped." - ); + console.warn("Google TTS client not initialized in test mode. Some tests may be skipped."); } else { console.warn( "Google TTS will not be available. Install @google-cloud/text-to-speech to use this engine." 
@@ -170,14 +166,18 @@ export class GoogleTTSClient extends AbstractTTSClient { try { const ssml = await this.prepareSSML(text, options); const voiceName = options?.voice || this.voiceId; - const supportsSSML = !voiceName || (voiceName.includes("Standard") || voiceName.includes("Wavenet")); + const supportsSSML = + !voiceName || voiceName.includes("Standard") || voiceName.includes("Wavenet"); let languageCode = this.lang || "en-US"; if (voiceName) { const parts = voiceName.split("-"); if (parts.length >= 2) languageCode = `${parts[0]}-${parts[1]}`; } const request: any = { - input: supportsSSML && SSMLUtils.isSSML(ssml) ? { ssml } : { text: SSMLUtils.isSSML(ssml) ? SSMLUtils.stripSSML(ssml) : ssml }, + input: + supportsSSML && SSMLUtils.isSSML(ssml) + ? { ssml } + : { text: SSMLUtils.isSSML(ssml) ? SSMLUtils.stripSSML(ssml) : ssml }, voice: { languageCode, name: voiceName }, audioConfig: { audioEncoding: options?.format === "mp3" ? "MP3" : "LINEAR16" }, }; @@ -214,7 +214,8 @@ export class GoogleTTSClient extends AbstractTTSClient { // Check if the voice supports SSML const voiceName = options?.voice || this.voiceId; // Only Standard and Wavenet voices support SSML - const supportsSSML = !voiceName || (voiceName.includes("Standard") || voiceName.includes("Wavenet")); + const supportsSSML = + !voiceName || voiceName.includes("Standard") || voiceName.includes("Wavenet"); // Extract language code from voice name if available let languageCode = this.lang || "en-US"; @@ -228,7 +229,10 @@ export class GoogleTTSClient extends AbstractTTSClient { // Prepare the request const request: any = { - input: supportsSSML && SSMLUtils.isSSML(ssml) ? { ssml } : { text: SSMLUtils.isSSML(ssml) ? SSMLUtils.stripSSML(ssml) : ssml }, + input: + supportsSSML && SSMLUtils.isSSML(ssml) + ? { ssml } + : { text: SSMLUtils.isSSML(ssml) ? 
SSMLUtils.stripSSML(ssml) : ssml }, voice: { languageCode: languageCode, name: voiceName, @@ -254,12 +258,12 @@ export class GoogleTTSClient extends AbstractTTSClient { } // Synthesize speech - let response; + let response: any; if (useWordTimings) { // Use beta API for word timings try { // Use dynamic import for ESM compatibility without static specifiers for bundlers - const dynamicImport: any = new Function('m', 'return import(m)'); + const dynamicImport: any = new Function("m", "return import(m)"); const ttsModule = await dynamicImport("@google-cloud/text-to-speech"); const betaClient = new ttsModule.v1beta1.TextToSpeechClient({ projectId: this.googleCredentials.projectId, @@ -268,7 +272,10 @@ export class GoogleTTSClient extends AbstractTTSClient { }); [response] = await betaClient.synthesizeSpeech(request); } catch (error) { - console.warn("Error using beta API for word timings, falling back to standard API:", error); + console.warn( + "Error using beta API for word timings, falling back to standard API:", + error + ); [response] = await this.client.synthesizeSpeech(request); } } else { @@ -277,17 +284,25 @@ export class GoogleTTSClient extends AbstractTTSClient { } // Process word timings if available - if (useWordTimings && response && 'timepoints' in response && Array.isArray(response.timepoints)) { - this.processTimepoints(response.timepoints as Array<{markName: string; timeSeconds: number}>, text); + if ( + useWordTimings && + response && + "timepoints" in response && + Array.isArray(response.timepoints) + ) { + this.processTimepoints( + response.timepoints as Array<{ markName: string; timeSeconds: number }>, + text + ); } else { // Create estimated word timings this._createEstimatedWordTimings(text); } // Return audio content, ensuring it's a Uint8Array - return response && response.audioContent ? - new Uint8Array(response.audioContent as Uint8Array) : - new Uint8Array(0); + return response && response.audioContent + ? 
new Uint8Array(response.audioContent as Uint8Array) + : new Uint8Array(0); } catch (error) { console.error("Error synthesizing speech:", error); throw error; @@ -300,7 +315,10 @@ export class GoogleTTSClient extends AbstractTTSClient { * @param options Synthesis options * @returns Promise resolving to an object containing the audio stream and word boundaries */ - async synthToBytestream(text: string, options?: GoogleTTSOptions): Promise<{ + async synthToBytestream( + text: string, + options?: GoogleTTSOptions + ): Promise<{ audioStream: ReadableStream; wordBoundaries: Array<{ text: string; offset: number; duration: number }>; }> { @@ -333,15 +351,17 @@ export class GoogleTTSClient extends AbstractTTSClient { }); // Always return the structure, populate boundaries only if requested AND available - const finalBoundaries = options?.useWordBoundary ? this.timings.map(([start, end, word]) => ({ - text: word, - offset: Math.round(start * 10000), // Convert to 100-nanosecond units - duration: Math.round((end - start) * 10000), - })) : []; + const finalBoundaries = options?.useWordBoundary + ? 
this.timings.map(([start, end, word]) => ({ + text: word, + offset: Math.round(start * 10000), // Convert to 100-nanosecond units + duration: Math.round((end - start) * 10000), + })) + : []; return { audioStream: stream, - wordBoundaries: finalBoundaries + wordBoundaries: finalBoundaries, }; } catch (error) { console.error("Error synthesizing speech stream:", error); @@ -383,10 +403,10 @@ export class GoogleTTSClient extends AbstractTTSClient { // Convert Google voices to unified format return rawVoices.map((voice: any) => ({ id: voice.name, - name: voice.name || 'Unknown', + name: voice.name || "Unknown", gender: voice.ssmlGender?.toLowerCase() || undefined, languageCodes: voice.languageCodes, - provider: 'google' as const, + provider: "google" as const, raw: voice, // Keep the original raw voice data })); } @@ -501,7 +521,10 @@ export class GoogleTTSClient extends AbstractTTSClient { * @param timepoints Timepoints from Google TTS response * @param text Original text */ - private processTimepoints(timepoints: Array<{markName: string; timeSeconds: number}>, text: string): void { + private processTimepoints( + timepoints: Array<{ markName: string; timeSeconds: number }>, + text: string + ): void { // Extract plain text from SSML if needed const plainText = SSMLUtils.isSSML(text) ? 
SSMLUtils.stripSSML(text) : text; @@ -518,11 +541,10 @@ export class GoogleTTSClient extends AbstractTTSClient { if (wordIndex >= 0 && wordIndex < words.length) { const word = words[wordIndex]; - const startTime = timepoint.timeSeconds; // Estimate end time (next timepoint or start + word length * average time per character) - let endTime; + let endTime: number; if (i < timepoints.length - 1) { endTime = timepoints[i + 1].timeSeconds; } else { @@ -543,7 +565,7 @@ export class GoogleTTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ protected getRequiredCredentials(): string[] { - return ['keyFilename']; // Primary credential type, though projectId and credentials are also supported + return ["keyFilename"]; // Primary credential type, though projectId and credentials are also supported } /** @@ -553,7 +575,10 @@ export class GoogleTTSClient extends AbstractTTSClient { async checkCredentials(): Promise { // If using API key mode, consider presence of key as valid for basic checks if (this.googleCredentials.apiKey) { - return typeof this.googleCredentials.apiKey === "string" && this.googleCredentials.apiKey.length > 10; + return ( + typeof this.googleCredentials.apiKey === "string" && + this.googleCredentials.apiKey.length > 10 + ); } // If the client is not available, check if the credentials file exists @@ -561,8 +586,8 @@ export class GoogleTTSClient extends AbstractTTSClient { try { // Only import fs in Node.js environment if (typeof window === "undefined") { - const dyn: any = new Function('m','return import(m)'); - const fs = await dyn('node:fs'); + const dyn: any = new Function("m", "return import(m)"); + const fs = await dyn("node:fs"); const credentials = this.credentials as GoogleTTSCredentials; // Check if the keyFilename exists @@ -571,8 +596,10 @@ export class GoogleTTSClient extends AbstractTTSClient { } // Check if the GOOGLE_APPLICATION_CREDENTIALS environment variable is set - if 
(process.env.GOOGLE_APPLICATION_CREDENTIALS && - fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS)) { + if ( + process.env.GOOGLE_APPLICATION_CREDENTIALS && + fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS) + ) { return true; } @@ -599,7 +626,11 @@ export class GoogleTTSClient extends AbstractTTSClient { * Check if credentials are valid with detailed response * @returns Promise resolving to an object with success flag and optional error message */ - async checkCredentialsDetailed(): Promise<{ success: boolean; error?: string; voiceCount?: number }> { + async checkCredentialsDetailed(): Promise<{ + success: boolean; + error?: string; + voiceCount?: number; + }> { // API key mode: try listing voices to validate the key if (this.googleCredentials.apiKey) { try { @@ -609,7 +640,7 @@ export class GoogleTTSClient extends AbstractTTSClient { } catch (error) { return { success: false, - error: error instanceof Error ? error.message : String(error) + error: error instanceof Error ? 
error.message : String(error), }; } } @@ -619,8 +650,8 @@ export class GoogleTTSClient extends AbstractTTSClient { try { // Only import fs in Node.js environment if (typeof window === "undefined") { - const dyn: any = new Function('m','return import(m)'); - const fs = await dyn('node:fs'); + const dyn: any = new Function("m", "return import(m)"); + const fs = await dyn("node:fs"); const credentials = this.credentials as GoogleTTSCredentials; // Check if the keyFilename exists @@ -629,32 +660,40 @@ export class GoogleTTSClient extends AbstractTTSClient { } // Check if the GOOGLE_APPLICATION_CREDENTIALS environment variable is set - if (process.env.GOOGLE_APPLICATION_CREDENTIALS && - fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS)) { - return { success: true, error: "GOOGLE_APPLICATION_CREDENTIALS file exists but client not initialized" }; + if ( + process.env.GOOGLE_APPLICATION_CREDENTIALS && + fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS) + ) { + return { + success: true, + error: "GOOGLE_APPLICATION_CREDENTIALS file exists but client not initialized", + }; } // Check if the GOOGLE_SA_PATH environment variable is set if (process.env.GOOGLE_SA_PATH && fs.existsSync(process.env.GOOGLE_SA_PATH)) { - return { success: true, error: "GOOGLE_SA_PATH file exists but client not initialized" }; + return { + success: true, + error: "GOOGLE_SA_PATH file exists but client not initialized", + }; } return { success: false, - error: "No valid credentials file found" + error: "No valid credentials file found", }; } else { // In browser environment without apiKey, we can't check file existence return { success: false, - error: "Cannot check Google credentials file existence in browser environment" + error: "Cannot check Google credentials file existence in browser environment", }; } } catch (error) { console.error("Error checking Google credentials:", error); return { success: false, - error: error instanceof Error ? 
error.message : String(error) + error: error instanceof Error ? error.message : String(error), }; } } @@ -692,11 +731,7 @@ export class GoogleTTSClient extends AbstractTTSClient { private base64ToBytes(b64: string): Uint8Array { // Node path - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore if (typeof Buffer !== "undefined" && typeof (Buffer as any).from === "function") { - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore const buf = (Buffer as any).from(b64, "base64"); return buf instanceof Uint8Array ? buf : new Uint8Array(buf); } @@ -706,5 +741,4 @@ export class GoogleTTSClient extends AbstractTTSClient { for (let i = 0; i < binary.length; i++) out[i] = binary.charCodeAt(i); return out; } - } diff --git a/src/engines/hume.ts b/src/engines/hume.ts index bc737da..1baada2 100644 --- a/src/engines/hume.ts +++ b/src/engines/hume.ts @@ -25,6 +25,25 @@ export class HumeTTSClient extends AbstractTTSClient { private baseUrl: string; private model: string; + static readonly VOICES = [ + { id: "ito", name: "Ito", gender: "Unknown" as const, language: "en" }, + { id: "acantha", name: "Acantha", gender: "Unknown" as const, language: "en" }, + { id: "ant ai gonus", name: "Antigonos", gender: "Unknown" as const, language: "en" }, + { id: "ari", name: "Ari", gender: "Unknown" as const, language: "en" }, + { id: "brant", name: "Brant", gender: "Unknown" as const, language: "en" }, + { id: "daniel", name: "Daniel", gender: "Unknown" as const, language: "en" }, + { id: "fin", name: "Fin", gender: "Unknown" as const, language: "en" }, + { id: "hype", name: "Hype", gender: "Unknown" as const, language: "en" }, + { id: "kora", name: "Kora", gender: "Unknown" as const, language: "en" }, + { id: "mango", name: "Mango", gender: "Unknown" as const, language: "en" }, + { id: "marek", name: "Marek", gender: "Unknown" as const, language: "en" }, + { id: "ogma", name: "Ogma", gender: "Unknown" as const, language: "en" }, + { id: 
"sora", name: "Sora", gender: "Unknown" as const, language: "en" }, + { id: "terrence", name: "Terrence", gender: "Unknown" as const, language: "en" }, + { id: "vitor", name: "Vitor", gender: "Unknown" as const, language: "en" }, + { id: "zach", name: "Zach", gender: "Unknown" as const, language: "en" }, + ]; + constructor(credentials: HumeTTSCredentials = {}) { super(credentials); this.apiKey = credentials.apiKey || process.env.HUME_API_KEY || ""; @@ -124,9 +143,9 @@ export class HumeTTSClient extends AbstractTTSClient { "Content-Type": "application/json", "X-Hume-Api-Key": this.apiKey, }, - body: JSON.stringify({ utterances: [{ text: "test" }] }), + body: JSON.stringify({ utterances: [{ text: "t" }] }), }); - return response.ok; + return response.status !== 401 && response.status !== 403; } catch { return false; } @@ -137,11 +156,23 @@ export class HumeTTSClient extends AbstractTTSClient { } protected async _getVoices(): Promise { - return []; + return HumeTTSClient.VOICES; } - protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { - return []; + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice) => ({ + id: voice.id, + name: voice.name, + gender: voice.gender as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en", + iso639_3: (voice.language || "en").split("-")[0], + display: voice.language || "English", + }, + ], + provider: "hume" as any, + })); } async synthToBytes(text: string, options: HumeTTSOptions = {}): Promise { diff --git a/src/engines/mistral.ts b/src/engines/mistral.ts index 251de4c..e130c1f 100644 --- a/src/engines/mistral.ts +++ b/src/engines/mistral.ts @@ -2,6 +2,7 @@ import { AbstractTTSClient } from "../core/abstract-tts"; import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { base64ToUint8Array } from 
"../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; const fetch = getFetch(); @@ -27,6 +28,36 @@ export class MistralTTSClient extends AbstractTTSClient { private model: string; private responseFormat: string; + static readonly VOICES = [ + { id: "Amalthea", name: "Amalthea", gender: "Unknown" as const, language: "en" }, + { id: "Achan", name: "Achan", gender: "Unknown" as const, language: "en" }, + { id: "Brave", name: "Brave", gender: "Unknown" as const, language: "en" }, + { id: "Contessa", name: "Contessa", gender: "Unknown" as const, language: "en" }, + { id: "Daintree", name: "Daintree", gender: "Unknown" as const, language: "en" }, + { id: "Eugora", name: "Eugora", gender: "Unknown" as const, language: "en" }, + { id: "Fornax", name: "Fornax", gender: "Unknown" as const, language: "en" }, + { id: "Griffin", name: "Griffin", gender: "Unknown" as const, language: "en" }, + { id: "Hestia", name: "Hestia", gender: "Unknown" as const, language: "en" }, + { id: "Irving", name: "Irving", gender: "Unknown" as const, language: "en" }, + { id: "Jasmine", name: "Jasmine", gender: "Unknown" as const, language: "en" }, + { id: "Kestra", name: "Kestra", gender: "Unknown" as const, language: "en" }, + { id: "Lorentz", name: "Lorentz", gender: "Unknown" as const, language: "en" }, + { id: "Mara", name: "Mara", gender: "Unknown" as const, language: "en" }, + { id: "Nettle", name: "Nettle", gender: "Unknown" as const, language: "en" }, + { id: "Orin", name: "Orin", gender: "Unknown" as const, language: "en" }, + { id: "Puck", name: "Puck", gender: "Unknown" as const, language: "en" }, + { id: "Quinn", name: "Quinn", gender: "Unknown" as const, language: "en" }, + { id: "Rune", name: "Rune", gender: "Unknown" as const, language: "en" }, + { id: "Simbe", name: "Simbe", gender: "Unknown" as const, language: "en" }, + { id: "Tertia", name: "Tertia", gender: "Unknown" as const, language: "en" }, + { id: "Umbriel", name: "Umbriel", gender: "Unknown" as const, 
language: "en" }, + { id: "Vesta", name: "Vesta", gender: "Unknown" as const, language: "en" }, + { id: "Wystan", name: "Wystan", gender: "Unknown" as const, language: "en" }, + { id: "Xeno", name: "Xeno", gender: "Unknown" as const, language: "en" }, + { id: "Yara", name: "Yara", gender: "Unknown" as const, language: "en" }, + { id: "Zephyr", name: "Zephyr", gender: "Unknown" as const, language: "en" }, + ]; + constructor(credentials: MistralTTSCredentials = {}) { super(credentials); this.apiKey = credentials.apiKey || process.env.MISTRAL_API_KEY || ""; @@ -137,11 +168,23 @@ export class MistralTTSClient extends AbstractTTSClient { } protected async _getVoices(): Promise { - return []; + return MistralTTSClient.VOICES; } - protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { - return []; + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice) => ({ + id: voice.id, + name: voice.name, + gender: voice.gender as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en", + iso639_3: (voice.language || "en").split("-")[0], + display: voice.language || "English", + }, + ], + provider: "mistral" as any, + })); } async synthToBytes(text: string, options: MistralTTSOptions = {}): Promise { @@ -176,14 +219,8 @@ export class MistralTTSClient extends AbstractTTSClient { } const json = (await response.json()) as { audio_data: string }; - const binaryStr = atob(json.audio_data); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } - this._createEstimatedWordTimings(preparedText); - return bytes; + return base64ToUint8Array(json.audio_data); } async synthToBytestream( @@ -266,12 +303,7 @@ export class MistralTTSClient extends AbstractTTSClient { try { const json = JSON.parse(data); if (json.type === "speech.audio.delta" && typeof json.audio_data === "string") { - const binaryStr = atob(json.audio_data); - const 
bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } - controller.enqueue(bytes); + controller.enqueue(base64ToUint8Array(json.audio_data)); } } catch { /* skip malformed */ diff --git a/src/engines/modelslab.ts b/src/engines/modelslab.ts index ef42aef..78a17f3 100644 --- a/src/engines/modelslab.ts +++ b/src/engines/modelslab.ts @@ -27,16 +27,70 @@ export interface ModelsLabTTSOptions extends SpeakOptions { /** Static list of available voices */ const MODELSLAB_VOICES: UnifiedVoice[] = [ // Emotion-capable female voices - { id: "madison", name: "Madison", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "tara", name: "Tara", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "leah", name: "Leah", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "jess", name: "Jess", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "mia", name: "Mia", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "zoe", name: "Zoe", gender: "Female", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, + { + id: "madison", + name: "Madison", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "tara", + name: "Tara", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "leah", + name: "Leah", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: 
"English (US)" }], + }, + { + id: "jess", + name: "Jess", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "mia", + name: "Mia", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "zoe", + name: "Zoe", + gender: "Female", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, // Emotion-capable male voices - { id: "leo", name: "Leo", gender: "Male", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "dan", name: "Dan", gender: "Male", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, - { id: "zac", name: "Zac", gender: "Male", provider: "modelslab", languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }, + { + id: "leo", + name: "Leo", + gender: "Male", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "dan", + name: "Dan", + gender: "Male", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, + { + id: "zac", + name: "Zac", + gender: "Male", + provider: "modelslab", + languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], + }, ]; const API_URL = "https://modelslab.com/api/v6/voice/text_to_speech"; @@ -67,7 +121,7 @@ export class ModelsLabTTSClient extends AbstractTTSClient { super(credentials); this.apiKey = credentials.apiKey || - (typeof process !== "undefined" ? process.env.MODELSLAB_API_KEY ?? "" : ""); + (typeof process !== "undefined" ? (process.env.MODELSLAB_API_KEY ?? 
"") : ""); this.defaultLanguage = DEFAULT_LANGUAGE; this.defaultSpeed = 1.0; if (!this.voiceId) { @@ -142,7 +196,13 @@ export class ModelsLabTTSClient extends AbstractTTSClient { const speed = options.speed ?? this.defaultSpeed; const language = options.language ?? this.defaultLanguage; - const audioBytes = await this._synthesize(processedText, voiceId, language, speed, options.emotion ?? false); + const audioBytes = await this._synthesize( + processedText, + voiceId, + language, + speed, + options.emotion ?? false + ); const audioStream = new ReadableStream({ start(controller) { @@ -215,10 +275,7 @@ export class ModelsLabTTSClient extends AbstractTTSClient { } /** Poll the fetch_result URL until audio is ready. */ - private async _poll( - fetchUrl: string, - fetch: ReturnType - ): Promise { + private async _poll(fetchUrl: string, fetch: ReturnType): Promise { for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) { await this._sleep(POLL_INTERVAL_MS); diff --git a/src/engines/murf.ts b/src/engines/murf.ts index 8da77a5..5759027 100644 --- a/src/engines/murf.ts +++ b/src/engines/murf.ts @@ -2,6 +2,7 @@ import { AbstractTTSClient } from "../core/abstract-tts"; import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; const fetch = getFetch(); @@ -209,11 +210,7 @@ export class MurfTTSClient extends AbstractTTSClient { } const json = (await response.json()) as { encodedAudio: string }; - const binaryStr = atob(json.encodedAudio); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } + const bytes = base64ToUint8Array(json.encodedAudio); this._createEstimatedWordTimings(preparedText); return bytes; } diff --git a/src/engines/openai.ts 
b/src/engines/openai.ts index 8b6c01a..f7ca703 100644 --- a/src/engines/openai.ts +++ b/src/engines/openai.ts @@ -1,9 +1,9 @@ // Node-only imports moved inside Node-only code paths below for browser compatibility. import { AbstractTTSClient } from "../core/abstract-tts"; -import * as SpeechMarkdown from "../markdown/converter"; import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; -import { type WordBoundary, estimateWordBoundaries } from "../utils/word-timing-estimator"; +import { estimateWordBoundaries, type WordBoundary } from "../utils/word-timing-estimator"; // Mock OpenAI types for TypeScript compilation // These will be replaced by the actual types when the openai package is installed @@ -58,7 +58,7 @@ export interface OpenAITTSOptions extends SpeakOptions { /** OpenAI Speed (maps to rate) */ speed?: number; /** Output format */ - format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm'; + format?: "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm"; /** * Output directory for audio files @@ -174,7 +174,7 @@ export class OpenAITTSClient extends AbstractTTSClient { return this.client; } - this.clientLoadingPromise = (new Function('m','return import(m)') as any)("openai") + this.clientLoadingPromise = (new Function("m", "return import(m)") as any)("openai") .then((openaiModule: any) => { const OpenAIClass = openaiModule.OpenAI; this.client = new OpenAIClass({ @@ -250,7 +250,7 @@ export class OpenAITTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ protected getRequiredCredentials(): string[] { - return ['apiKey']; + return ["apiKey"]; } /** @@ -391,7 +391,9 @@ export class OpenAITTSClient extends AbstractTTSClient { */ async textToSpeech(text: string, options: OpenAITTSOptions = {}): Promise { if (typeof window !== "undefined") { - throw new Error("textToSpeech with file output is not supported 
in the browser. Use synthToBytes or synthToBytestream instead."); + throw new Error( + "textToSpeech with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead." + ); } // Node.js only const importNodeBuiltin = async (name: string) => import(`node:${name}`); @@ -436,7 +438,9 @@ export class OpenAITTSClient extends AbstractTTSClient { */ async textToSpeechStreaming(text: string, options: OpenAITTSOptions = {}): Promise { if (typeof window !== "undefined") { - throw new Error("textToSpeechStreaming with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead."); + throw new Error( + "textToSpeechStreaming with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead." + ); } const importNodeBuiltin = async (name: string) => import(`node:${name}`); const fs = await importNodeBuiltin("fs"); @@ -588,7 +592,10 @@ export class OpenAITTSClient extends AbstractTTSClient { * @param _options Synthesis options (currently unused for streaming, uses defaults). * @returns Promise resolving to an object containing the audio stream and an empty word boundaries array. 
*/ - async synthToBytestream(text: string, _options?: SpeakOptions): Promise<{ + async synthToBytestream( + text: string, + _options?: SpeakOptions + ): Promise<{ audioStream: ReadableStream; wordBoundaries: Array<{ text: string; offset: number; duration: number }>; }> { diff --git a/src/engines/playht.ts b/src/engines/playht.ts index 1923861..52222a6 100644 --- a/src/engines/playht.ts +++ b/src/engines/playht.ts @@ -1,9 +1,10 @@ import { AbstractTTSClient } from "../core/abstract-tts"; -import * as SpeechMarkdown from "../markdown/converter"; import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; -import { estimateWordBoundaries, type WordBoundary } from "../utils/word-timing-estimator"; import { getFetch } from "../utils/fetch-utils"; +import { estimateWordBoundaries, type WordBoundary } from "../utils/word-timing-estimator"; + // Node-only imports moved inside Node-only code paths below for browser compatibility. 
// Get the fetch implementation for the current environment diff --git a/src/engines/polly.ts b/src/engines/polly.ts index 3d213ad..0bc8cbb 100644 --- a/src/engines/polly.ts +++ b/src/engines/polly.ts @@ -1,19 +1,18 @@ +import type { + SynthesizeSpeechCommandInput, + SynthesizeSpeechCommandOutput, +} from "@aws-sdk/client-polly"; import { AbstractTTSClient } from "../core/abstract-tts"; import * as SSMLUtils from "../core/ssml-utils"; -import type { - SpeakOptions, - TTSCredentials, - UnifiedVoice, -} from "../types"; import * as SpeechMarkdown from "../markdown/converter"; -import type { SynthesizeSpeechCommandInput, SynthesizeSpeechCommandOutput } from "@aws-sdk/client-polly"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { streamToBuffer } from "../utils/stream-utils"; /** * Extended options for Polly TTS */ export interface PollyTTSOptions extends SpeakOptions { - format?: 'mp3' | 'wav' | 'ogg'; // Define formats supported by this client logic + format?: "mp3" | "wav" | "ogg"; // Define formats supported by this client logic filePath?: string; // Path to save the file (if provided, it's for file saving, not playback) } @@ -83,7 +82,9 @@ export class PollyTTSClient extends AbstractTTSClient { */ protected async _getVoices(): Promise { try { - const pollyModule = this._pollyModule || (await (new Function('m','return import(m)') as any)("@aws-sdk/client-polly")); + const pollyModule = + this._pollyModule || + (await (new Function("m", "return import(m)") as any)("@aws-sdk/client-polly")); if (!this.client) { const PollyClient = pollyModule.PollyClient; this.client = new PollyClient({ @@ -115,7 +116,7 @@ export class PollyTTSClient extends AbstractTTSClient { // Populate the voice cache for engine detection this.voiceCache.clear(); - voices.forEach(voice => { + voices.forEach((voice) => { this.voiceCache.set(voice.id, voice); }); @@ -228,8 +229,6 @@ export class PollyTTSClient extends AbstractTTSClient { } } - - /** * Prepare 
SSML for AWS Polly based on voice engine capabilities * @param text Text or SSML to prepare @@ -263,17 +262,17 @@ export class PollyTTSClient extends AbstractTTSClient { } // Validate and process SSML for Polly compatibility - const validation = SSMLUtils.validateSSMLForEngine(text, 'polly', voiceId); + const validation = SSMLUtils.validateSSMLForEngine(text, "polly", voiceId); if (validation.warnings.length > 0) { - console.warn('Polly SSML warnings:', validation.warnings); + console.warn("Polly SSML warnings:", validation.warnings); } if (!validation.isValid) { - console.error('Polly SSML validation errors:', validation.errors); - throw new Error(`Invalid SSML for Polly: ${validation.errors.join(', ')}`); + console.error("Polly SSML validation errors:", validation.errors); + throw new Error(`Invalid SSML for Polly: ${validation.errors.join(", ")}`); } // Process SSML for Polly compatibility (removes unsupported tags based on voice type) - text = SSMLUtils.processSSMLForEngine(text, 'polly', voiceId); + text = SSMLUtils.processSSMLForEngine(text, "polly", voiceId); // Get SSML support level for additional processing const ssmlSupport = await this.getSSMLSupportLevel(voiceId); @@ -287,7 +286,7 @@ export class PollyTTSClient extends AbstractTTSClient { } // 2. Fix any self-closing tags that Polly doesn't support - text = text.replace(/]+)\/>/gi, ''); + text = text.replace(/]+)\/>/gi, ""); // 3. 
Apply prosody settings if needed if ( @@ -304,7 +303,7 @@ export class PollyTTSClient extends AbstractTTSClient { const prosodyContent = this.constructProsodyTag(content); // Put back inside speak tags with the original attributes - const openingTag = text.substring(0, text.indexOf('>') + 1); + const openingTag = text.substring(0, text.indexOf(">") + 1); text = `${openingTag}${prosodyContent}`; } } @@ -319,12 +318,11 @@ export class PollyTTSClient extends AbstractTTSClient { * @param options Synthesis options * @returns Promise resolving to audio bytes */ - async synthToBytes( - text: string, - options?: PollyTTSOptions - ): Promise { + async synthToBytes(text: string, options?: PollyTTSOptions): Promise { try { - const pollyModule = this._pollyModule || (await (new Function('m','return import(m)') as any)("@aws-sdk/client-polly")); + const pollyModule = + this._pollyModule || + (await (new Function("m", "return import(m)") as any)("@aws-sdk/client-polly")); if (!this.client) { const PollyClient = pollyModule.PollyClient; this.client = new PollyClient({ @@ -340,19 +338,14 @@ export class PollyTTSClient extends AbstractTTSClient { const { OutputFormat, SynthesizeSpeechCommand, VoiceId } = pollyModule; // Determine the output format - // For Polly, we always request PCM for WAV (so we can add the header) - // and MP3/OGG directly for those formats const requestedFormat = options?.format || "wav"; - let outputFormat; + let outputFormat: any; if (requestedFormat === "mp3") { - // Request MP3 directly from Polly outputFormat = OutputFormat.MP3; } else if (requestedFormat === "ogg") { - // Request OGG directly from Polly outputFormat = OutputFormat.OGG_VORBIS; } else { - // For WAV, request PCM and we'll add the WAV header outputFormat = OutputFormat.PCM; } @@ -437,7 +430,9 @@ export class PollyTTSClient extends AbstractTTSClient { wordBoundaries: Array<{ text: string; offset: number; duration: number }>; }> { try { - const pollyModule = this._pollyModule || (await (new 
Function('m','return import(m)') as any)("@aws-sdk/client-polly")); + const pollyModule = + this._pollyModule || + (await (new Function("m", "return import(m)") as any)("@aws-sdk/client-polly")); if (!this.client) { const PollyClient = pollyModule.PollyClient; this.client = new PollyClient({ @@ -449,7 +444,13 @@ export class PollyTTSClient extends AbstractTTSClient { }); this._pollyModule = pollyModule; } - const { OutputFormat, SynthesizeSpeechCommandInput, SynthesizeSpeechCommand, VoiceId, SpeechMarkType } = pollyModule; + const { + OutputFormat, + SynthesizeSpeechCommandInput, + SynthesizeSpeechCommand, + VoiceId, + SpeechMarkType, + } = pollyModule; const VoiceIdType = VoiceId; // Get the RUNTIME VoiceId enum/object const voiceIdString = options?.voice || this.voiceId || "Joanna"; const voiceId = voiceIdString as unknown as typeof VoiceIdType; // Cast via unknown @@ -467,7 +468,7 @@ export class PollyTTSClient extends AbstractTTSClient { const { Engine } = pollyModule; const engine = Engine[engineString as keyof typeof Engine] || Engine.standard; - let wordBoundaries: Array<{ text: string; offset: number; duration: number }> = []; + const wordBoundaries: Array<{ text: string; offset: number; duration: number }> = []; // Request Speech Marks (JSON) try { @@ -509,11 +510,8 @@ export class PollyTTSClient extends AbstractTTSClient { // Caller should check wordBoundaries array length if marks are critical } - // Request Audio Stream (PCM/MP3/OGG) - // For Polly, we always request PCM for WAV (so we can add the header) - // and MP3/OGG directly for those formats const requestedFormat = options?.format || "wav"; - let outputFormat; + let outputFormat: any; if (requestedFormat === "mp3") { // Request MP3 directly from Polly @@ -571,7 +569,7 @@ export class PollyTTSClient extends AbstractTTSClient { start(controller) { controller.enqueue(wavData); controller.close(); - } + }, }); } catch (error) { console.error("Error adding WAV header to PCM stream:", error); @@ -628,7 
+626,11 @@ export class PollyTTSClient extends AbstractTTSClient { * @param sampleRate Sample rate in Hz (default: 16000) * @returns PCM audio data with WAV header */ - private addWavHeader(pcmData: Uint8Array, sampleRate: number = 16000, _isForPlayback: boolean = false): Uint8Array { + private addWavHeader( + pcmData: Uint8Array, + sampleRate: number = 16000, + _isForPlayback: boolean = false + ): Uint8Array { // Always use 16000 Hz for Polly PCM data to match the Python implementation // The Python implementation uses wav.setparams((1, 2, 16000, 0, "NONE", "NONE")) sampleRate = 16000; @@ -646,10 +648,10 @@ export class PollyTTSClient extends AbstractTTSClient { // Chunk size (file size - 8) const fileSize = pcmData.length + headerSize - 8; - wavData[4] = fileSize & 0xFF; - wavData[5] = (fileSize >> 8) & 0xFF; - wavData[6] = (fileSize >> 16) & 0xFF; - wavData[7] = (fileSize >> 24) & 0xFF; + wavData[4] = fileSize & 0xff; + wavData[5] = (fileSize >> 8) & 0xff; + wavData[6] = (fileSize >> 16) & 0xff; + wavData[7] = (fileSize >> 24) & 0xff; // "WAVE" format wavData[8] = 0x57; // 'W' @@ -659,7 +661,7 @@ export class PollyTTSClient extends AbstractTTSClient { // "fmt " sub-chunk wavData[12] = 0x66; // 'f' - wavData[13] = 0x6D; // 'm' + wavData[13] = 0x6d; // 'm' wavData[14] = 0x74; // 't' wavData[15] = 0x20; // ' ' @@ -678,17 +680,17 @@ export class PollyTTSClient extends AbstractTTSClient { wavData[23] = 0; // Sample rate (always 16000 Hz for Polly PCM) - wavData[24] = sampleRate & 0xFF; - wavData[25] = (sampleRate >> 8) & 0xFF; - wavData[26] = (sampleRate >> 16) & 0xFF; - wavData[27] = (sampleRate >> 24) & 0xFF; + wavData[24] = sampleRate & 0xff; + wavData[25] = (sampleRate >> 8) & 0xff; + wavData[26] = (sampleRate >> 16) & 0xff; + wavData[27] = (sampleRate >> 24) & 0xff; // Byte rate (SampleRate * NumChannels * BitsPerSample/8) - const byteRate = sampleRate * 1 * 16 / 8; - wavData[28] = byteRate & 0xFF; - wavData[29] = (byteRate >> 8) & 0xFF; - wavData[30] = 
(byteRate >> 16) & 0xFF; - wavData[31] = (byteRate >> 24) & 0xFF; + const byteRate = (sampleRate * 1 * 16) / 8; + wavData[28] = byteRate & 0xff; + wavData[29] = (byteRate >> 8) & 0xff; + wavData[30] = (byteRate >> 16) & 0xff; + wavData[31] = (byteRate >> 24) & 0xff; // Block align (NumChannels * BitsPerSample/8) wavData[32] = 2; // 1 * 16 / 8 @@ -705,10 +707,10 @@ export class PollyTTSClient extends AbstractTTSClient { wavData[39] = 0x61; // 'a' // Sub-chunk size (data size) - wavData[40] = pcmData.length & 0xFF; - wavData[41] = (pcmData.length >> 8) & 0xFF; - wavData[42] = (pcmData.length >> 16) & 0xFF; - wavData[43] = (pcmData.length >> 24) & 0xFF; + wavData[40] = pcmData.length & 0xff; + wavData[41] = (pcmData.length >> 8) & 0xff; + wavData[42] = (pcmData.length >> 16) & 0xff; + wavData[43] = (pcmData.length >> 24) & 0xff; // Copy PCM data after header wavData.set(pcmData, headerSize); @@ -721,7 +723,7 @@ export class PollyTTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ protected getRequiredCredentials(): string[] { - return ['region', 'accessKeyId', 'secretAccessKey']; + return ["region", "accessKeyId", "secretAccessKey"]; } /** @@ -731,18 +733,20 @@ export class PollyTTSClient extends AbstractTTSClient { async checkCredentials(): Promise { // If a client is injected, attempt a lightweight API call const injected = (this.credentials as any)?.client; - if (!injected) { + if (injected) { + this.client = injected; + } else { // Fast-fail if required credentials are missing to avoid importing SDK in CI/tests const { region, accessKeyId, secretAccessKey } = this.credentials as any; if (!region || !accessKeyId || !secretAccessKey) { return false; } - } else { - this.client = injected; } try { - const pollyModule = this._pollyModule || (await (new Function('m','return import(m)') as any)("@aws-sdk/client-polly")); + const pollyModule = + this._pollyModule || + (await (new Function("m", "return import(m)") as 
any)("@aws-sdk/client-polly")); if (!this.client) { const PollyClient = pollyModule.PollyClient; this.client = new PollyClient({ diff --git a/src/engines/resemble.ts b/src/engines/resemble.ts index 253f43f..007933c 100644 --- a/src/engines/resemble.ts +++ b/src/engines/resemble.ts @@ -2,6 +2,7 @@ import { AbstractTTSClient } from "../core/abstract-tts"; import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; +import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; const fetch = getFetch(); @@ -99,15 +100,13 @@ export class ResembleTTSClient extends AbstractTTSClient { async checkCredentials(): Promise { if (!this.apiKey) return false; try { - const response = await fetch(`${this.baseUrl}/synthesize`, { - method: "POST", + const response = await fetch(`${this.baseUrl}/v2/voices`, { + method: "GET", headers: { - "Content-Type": "application/json", Authorization: this.apiKey, }, - body: JSON.stringify({ voice_uuid: "test", data: "test" }), }); - return response.status !== 401; + return response.ok; } catch { return false; } @@ -118,11 +117,35 @@ export class ResembleTTSClient extends AbstractTTSClient { } protected async _getVoices(): Promise { - return []; + try { + const response = await fetch(`${this.baseUrl}/v2/voices`, { + method: "GET", + headers: { + Authorization: this.apiKey, + }, + }); + if (!response.ok) return []; + const data = await response.json(); + return Array.isArray(data) ? 
data : []; + } catch { + return []; + } } - protected async _mapVoicesToUnified(_rawVoices: any[]): Promise { - return []; + protected async _mapVoicesToUnified(rawVoices: any[]): Promise { + return rawVoices.map((voice: any) => ({ + id: voice.uuid || voice.id || voice.voice_uuid, + name: voice.name || voice.uuid || voice.id, + gender: (voice.gender || "Unknown") as "Male" | "Female" | "Unknown", + languageCodes: [ + { + bcp47: voice.language || "en", + iso639_3: (voice.language || "en").split("-")[0], + display: voice.language || "English", + }, + ], + provider: "resemble" as any, + })); } async synthToBytes(text: string, options: ResembleTTSOptions = {}): Promise { @@ -152,11 +175,7 @@ export class ResembleTTSClient extends AbstractTTSClient { } const json = (await response.json()) as { audio_content: string }; - const binaryStr = atob(json.audio_content); - const bytes = new Uint8Array(binaryStr.length); - for (let i = 0; i < binaryStr.length; i++) { - bytes[i] = binaryStr.charCodeAt(i); - } + const bytes = base64ToUint8Array(json.audio_content); this._createEstimatedWordTimings(preparedText); return bytes; diff --git a/src/engines/sapi.ts b/src/engines/sapi.ts index 49c8d64..6af203a 100644 --- a/src/engines/sapi.ts +++ b/src/engines/sapi.ts @@ -1,10 +1,10 @@ -import { AbstractTTSClient } from "../core/abstract-tts"; -import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; -import * as SpeechMarkdown from "../markdown/converter"; import { spawn } from "node:child_process"; -import { readFileSync, unlinkSync, existsSync } from "node:fs"; +import { existsSync, readFileSync, unlinkSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { AbstractTTSClient } from "../core/abstract-tts"; +import * as SpeechMarkdown from "../markdown/converter"; +import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; /** * SAPI TTS Client Credentials @@ -80,7 +80,7 @@ export class SAPITTSClient extends 
AbstractTTSClient { async checkCredentials(): Promise { try { this.validateEnvironment(); - + // Test if PowerShell and System.Speech are available const testScript = ` try { @@ -136,7 +136,9 @@ export class SAPITTSClient extends AbstractTTSClient { const parsedResult = JSON.parse(result); // Ensure we have an array (PowerShell returns single object when there's only one voice) - const voiceData: SAPIVoiceInfo[] = Array.isArray(parsedResult) ? parsedResult : [parsedResult]; + const voiceData: SAPIVoiceInfo[] = Array.isArray(parsedResult) + ? parsedResult + : [parsedResult]; // Convert to unified voice format const unifiedVoices: UnifiedVoice[] = voiceData.map((voice) => ({ @@ -334,13 +336,17 @@ export class SAPITTSClient extends AbstractTTSClient { $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() # Set voice if specified - ${voiceName ? ` + ${ + voiceName + ? ` try { $synth.SelectVoice("${this.escapePowerShellString(voiceName)}") } catch { # If voice selection fails, continue with default voice Write-Warning "Could not select voice '${this.escapePowerShellString(voiceName)}', using default voice" - }` : ""} + }` + : "" + } # Set speech properties $synth.Rate = ${rate} @@ -383,7 +389,9 @@ export class SAPITTSClient extends AbstractTTSClient { } // Create estimated word timings (SAPI doesn't provide real-time events in this mode) - this._createEstimatedWordTimings(isSSMLProcessed ? this.stripSSML(processedText) : processedText); + this._createEstimatedWordTimings( + isSSMLProcessed ? this.stripSSML(processedText) : processedText + ); return new Uint8Array(audioBuffer); } catch (error) { @@ -480,10 +488,11 @@ export class SAPITTSClient extends AbstractTTSClient { return 4; case "x-fast": return 8; - default: + default: { // Try to parse as number const parsed = Number.parseFloat(rate); return Number.isNaN(parsed) ? 
0 : Math.max(-10, Math.min(10, parsed)); + } } } @@ -518,10 +527,11 @@ export class SAPITTSClient extends AbstractTTSClient { return 80; case "x-loud": return 100; - default: + default: { // Try to parse as number const parsed = Number.parseFloat(volume); return Number.isNaN(parsed) ? 100 : Math.max(0, Math.min(100, parsed)); + } } } @@ -539,11 +549,7 @@ export class SAPITTSClient extends AbstractTTSClient { * @returns Escaped string */ private escapePowerShellString(str: string): string { - return str - .replace(/\\/g, "\\\\") - .replace(/"/g, '""') - .replace(/`/g, "``") - .replace(/\$/g, "`$"); + return str.replace(/\\/g, "\\\\").replace(/"/g, '""').replace(/`/g, "``").replace(/\$/g, "`$"); } /** @@ -566,26 +572,29 @@ export class SAPITTSClient extends AbstractTTSClient { const trimmedText = text.trim(); // Check if the SSML already has version attribute - if (trimmedText.includes('version=')) { + if (trimmedText.includes("version=")) { return text; // Return original text to preserve formatting } // Check for double wrapping (defensive programming) // This prevents the bug where SSML gets wrapped twice - if (trimmedText.includes(' tag, add the version attribute // Note: SAPI requires xml:lang="en" (not "en-US") for SSML to work properly - if (trimmedText.startsWith('')) { - return text.replace('', ''); + if (trimmedText.startsWith("")) { + return text.replace("", ''); } // If it doesn't start with , wrap it properly // Note: SAPI requires xml:lang="en" (not "en-US") for SSML to work properly - if (!trimmedText.startsWith('${text}`; } diff --git a/src/engines/sherpaonnx-wasm.ts b/src/engines/sherpaonnx-wasm.ts index 04d6746..f8660fe 100644 --- a/src/engines/sherpaonnx-wasm.ts +++ b/src/engines/sherpaonnx-wasm.ts @@ -9,12 +9,12 @@ */ import { AbstractTTSClient } from "../core/abstract-tts"; -import * as SpeechMarkdown from "../markdown/converter"; import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; import 
type { SpeakOptions, TTSCredentials, UnifiedVoice, WordBoundaryCallback } from "../types"; +import { decompressBzip2 } from "../utils/bzip2"; import { fileSystem, isBrowser, isNode, pathUtils } from "../utils/environment"; import { estimateWordBoundaries } from "../utils/word-timing-estimator"; -import { decompressBzip2 } from "../utils/bzip2"; // Enhanced model type definitions for multi-model support export type ModelType = "kokoro" | "matcha" | "vits" | "mms"; diff --git a/src/engines/sherpaonnx.ts b/src/engines/sherpaonnx.ts index 50d6259..c8296ec 100644 --- a/src/engines/sherpaonnx.ts +++ b/src/engines/sherpaonnx.ts @@ -12,11 +12,10 @@ import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; // Capture native fetch at module level const nativeFetch = globalThis.fetch; -// Import the generated models config -import { SHERPA_MODELS_CONFIG } from "./sherpaonnx/generated_models"; - // Import the sherpaonnx-loader import * as sherpaOnnxLoaderModule from "../utils/sherpaonnx-loader"; +// Import the generated models config +import { SHERPA_MODELS_CONFIG } from "./sherpaonnx/generated_models"; // Module scope variables to hold the imported modules let sherpa: any; diff --git a/src/engines/upliftai.ts b/src/engines/upliftai.ts index 22b0c7a..935d1ec 100644 --- a/src/engines/upliftai.ts +++ b/src/engines/upliftai.ts @@ -1,6 +1,6 @@ import { AbstractTTSClient } from "../core/abstract-tts"; -import * as SpeechMarkdown from "../markdown/converter"; import * as SSMLUtils from "../core/ssml-utils"; +import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; diff --git a/src/engines/watson.ts b/src/engines/watson.ts index 7501d5d..1d783c9 100644 --- a/src/engines/watson.ts +++ b/src/engines/watson.ts @@ -210,7 +210,7 @@ export class WatsonTTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ 
protected getRequiredCredentials(): string[] { - return ['apiKey', 'region', 'instanceId']; + return ["apiKey", "region", "instanceId"]; } /** diff --git a/src/engines/witai.ts b/src/engines/witai.ts index 32aaf4b..bb13b8b 100644 --- a/src/engines/witai.ts +++ b/src/engines/witai.ts @@ -63,7 +63,7 @@ export class WitAITTSClient extends AbstractTTSClient { * @returns Array of required credential field names */ protected getRequiredCredentials(): string[] { - return ['token']; + return ["token"]; } /** @@ -196,7 +196,9 @@ export class WitAITTSClient extends AbstractTTSClient { try { // Prepare text for synthesis (strip SSML/Markdown if present) const preparedText = await this.prepareText(text, options); - console.debug(`${this.constructor.name}.synthToBytes - TTS text ${preparedText}, Options: ${JSON.stringify(options)}`); + console.debug( + `${this.constructor.name}.synthToBytes - TTS text ${preparedText}, Options: ${JSON.stringify(options)}` + ); // Use provided voice or the one set with setVoice let voice = options?.voice || this.voiceId; diff --git a/src/engines/xai.ts b/src/engines/xai.ts index c469e9a..8313807 100644 --- a/src/engines/xai.ts +++ b/src/engines/xai.ts @@ -6,8 +6,6 @@ import { getFetch } from "../utils/fetch-utils"; const fetch = getFetch(); -const AUDIO_TAG_REGEX = /\[[^\]]+\]/g; - export interface XaiTTSOptions extends SpeakOptions { model?: string; voice?: string; @@ -76,7 +74,6 @@ export class XaiTTSClient extends AbstractTTSClient { } private processAudioTags(text: string): string { - if (!AUDIO_TAG_REGEX.test(text)) return text; return text; } diff --git a/src/markdown/converter-browser.ts b/src/markdown/converter-browser.ts index 818c8bd..5041b93 100644 --- a/src/markdown/converter-browser.ts +++ b/src/markdown/converter-browser.ts @@ -1,5 +1,5 @@ -import { isBrowser } from "../utils/environment"; import { SpeechMarkdown as SMSpeechMarkdown } from "speechmarkdown-js"; +import { isBrowser } from "../utils/environment"; export interface 
SpeechMarkdownRuntimeConfig { enabled?: boolean; diff --git a/src/utils/base64-utils.ts b/src/utils/base64-utils.ts new file mode 100644 index 0000000..e5bc025 --- /dev/null +++ b/src/utils/base64-utils.ts @@ -0,0 +1,8 @@ +export function base64ToUint8Array(b64: string): Uint8Array { + const binaryStr = atob(b64); + const bytes = new Uint8Array(binaryStr.length); + for (let i = 0; i < binaryStr.length; i++) { + bytes[i] = binaryStr.charCodeAt(i); + } + return bytes; +} diff --git a/src/utils/sherpaonnx-loader.js b/src/utils/sherpaonnx-loader.js index 2ab3eda..b397619 100644 --- a/src/utils/sherpaonnx-loader.js +++ b/src/utils/sherpaonnx-loader.js @@ -108,21 +108,25 @@ async function loadSherpaOnnxNode() { // If loading fails, provide helpful error message console.warn( "Failed to load sherpa-onnx-node directly. " + - "This might be because the environment variables need to be set before the Node.js process starts." + "This might be because the environment variables need to be set before the Node.js process starts." ); console.error("\nTo use SherpaOnnx TTS, you need to:"); console.error("1. Install the sherpa-onnx-node package: npm run install:sherpaonnx"); console.error("2. Run your application with the correct environment variables:"); console.error(` - On macOS: DYLD_LIBRARY_PATH=${libraryPath} node your-script.js`); - console.error(" - On Linux: LD_LIBRARY_PATH=/path/to/node_modules/sherpa-onnx-linux-x64 node your-script.js"); + console.error( + " - On Linux: LD_LIBRARY_PATH=/path/to/node_modules/sherpa-onnx-linux-x64 node your-script.js" + ); console.error(" - On Windows: No special environment variable needed"); - console.error("3. Or use the helper script: node scripts/run-with-sherpaonnx.cjs your-script.js"); + console.error( + "3. Or use the helper script: node scripts/run-with-sherpaonnx.cjs your-script.js" + ); throw new Error( "Could not load sherpa-onnx-node. 
" + - "Please use the run-with-sherpaonnx.cjs script to run your application: " + - "node scripts/run-with-sherpaonnx.cjs your-script.js" + "Please use the run-with-sherpaonnx.cjs script to run your application: " + + "node scripts/run-with-sherpaonnx.cjs your-script.js" ); } } diff --git a/src/utils/stream-utils.ts b/src/utils/stream-utils.ts index aa697dc..ae720f0 100644 --- a/src/utils/stream-utils.ts +++ b/src/utils/stream-utils.ts @@ -1,7 +1,6 @@ -import { isNode } from "./environment"; // Assuming this utility exists - // Import Node.js stream type if needed import type { Readable } from "node:stream"; +import { isNode } from "./environment"; // Assuming this utility exists /** * Reads a ReadableStream (Web) or NodeJS.ReadableStream completely From 7d51571f2ad422b28783ad23610bbb161fc3235c Mon Sep 17 00:00:00 2001 From: will wade Date: Wed, 8 Apr 2026 14:46:22 +0100 Subject: [PATCH 4/7] fix: normalize language codes across all new engines - Add shared language-utils.ts with toIso639_3(), toLanguageDisplay() - Fix bcp47: use full BCP-47 codes (en-US not en) in static voice lists - Fix iso639_3: use proper 3-letter ISO 639-3 codes (eng not en) - Fix display: use human-readable names (English (US) not en-US) - Applies to: cartesia, deepgram, hume, xai, fishaudio, mistral, murf, unrealspeech, resemble --- src/engines/cartesia.ts | 7 +- src/engines/deepgram.ts | 5 +- src/engines/fishaudio.ts | 7 +- src/engines/hume.ts | 39 ++-- src/engines/mistral.ts | 61 +++--- src/engines/murf.ts | 5 +- src/engines/resemble.ts | 7 +- src/engines/unrealspeech.ts | 5 +- src/engines/xai.ts | 19 +- src/utils/language-utils.ts | 416 ++++++++++++++++++++++++++++++++++++ 10 files changed, 498 insertions(+), 73 deletions(-) create mode 100644 src/utils/language-utils.ts diff --git a/src/engines/cartesia.ts b/src/engines/cartesia.ts index c7f7a8a..8fab400 100644 --- a/src/engines/cartesia.ts +++ b/src/engines/cartesia.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; 
import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -261,11 +262,11 @@ export class CartesiaTTSClient extends AbstractTTSClient { ? [ { bcp47: voice.language, - iso639_3: voice.language.split("-")[0], - display: voice.language, + iso639_3: toIso639_3(voice.language), + display: toLanguageDisplay(voice.language), }, ] - : [{ bcp47: "en-US", iso639_3: "eng", display: "English" }], + : [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], provider: "cartesia" as any, })); } diff --git a/src/engines/deepgram.ts b/src/engines/deepgram.ts index 8299eeb..94f1178 100644 --- a/src/engines/deepgram.ts +++ b/src/engines/deepgram.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -177,8 +178,8 @@ export class DeepgramTTSClient extends AbstractTTSClient { languageCodes: [ { bcp47: voice.language || "en-US", - iso639_3: (voice.language || "en-US").split("-")[0], - display: voice.language || "English (US)", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "deepgram" as any, diff --git a/src/engines/fishaudio.ts b/src/engines/fishaudio.ts index fc520c5..c20e9af 100644 --- a/src/engines/fishaudio.ts +++ b/src/engines/fishaudio.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; 
+import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -169,10 +170,10 @@ export class FishAudioTTSClient extends AbstractTTSClient { languageCodes: voice.languages ? voice.languages.map((lang: string) => ({ bcp47: lang, - iso639_3: lang.split("-")[0], - display: lang, + iso639_3: toIso639_3(lang), + display: toLanguageDisplay(lang), })) - : [{ bcp47: "en", iso639_3: "eng", display: "English" }], + : [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }], provider: "fishaudio" as any, })); } diff --git a/src/engines/hume.ts b/src/engines/hume.ts index 1baada2..1dee088 100644 --- a/src/engines/hume.ts +++ b/src/engines/hume.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -26,22 +27,22 @@ export class HumeTTSClient extends AbstractTTSClient { private model: string; static readonly VOICES = [ - { id: "ito", name: "Ito", gender: "Unknown" as const, language: "en" }, - { id: "acantha", name: "Acantha", gender: "Unknown" as const, language: "en" }, - { id: "ant ai gonus", name: "Antigonos", gender: "Unknown" as const, language: "en" }, - { id: "ari", name: "Ari", gender: "Unknown" as const, language: "en" }, - { id: "brant", name: "Brant", gender: "Unknown" as const, language: "en" }, - { id: "daniel", name: "Daniel", gender: "Unknown" as const, language: "en" }, - { id: "fin", name: "Fin", gender: "Unknown" as const, language: "en" }, - { id: "hype", name: "Hype", gender: "Unknown" as const, language: "en" }, - { id: "kora", name: "Kora", gender: "Unknown" as const, language: "en" }, - { id: "mango", name: "Mango", gender: "Unknown" as const, language: "en" }, - { id: "marek", name: "Marek", gender: "Unknown" as 
const, language: "en" }, - { id: "ogma", name: "Ogma", gender: "Unknown" as const, language: "en" }, - { id: "sora", name: "Sora", gender: "Unknown" as const, language: "en" }, - { id: "terrence", name: "Terrence", gender: "Unknown" as const, language: "en" }, - { id: "vitor", name: "Vitor", gender: "Unknown" as const, language: "en" }, - { id: "zach", name: "Zach", gender: "Unknown" as const, language: "en" }, + { id: "ito", name: "Ito", gender: "Unknown" as const, language: "en-US" }, + { id: "acantha", name: "Acantha", gender: "Unknown" as const, language: "en-US" }, + { id: "ant ai gonus", name: "Antigonos", gender: "Unknown" as const, language: "en-US" }, + { id: "ari", name: "Ari", gender: "Unknown" as const, language: "en-US" }, + { id: "brant", name: "Brant", gender: "Unknown" as const, language: "en-US" }, + { id: "daniel", name: "Daniel", gender: "Unknown" as const, language: "en-US" }, + { id: "fin", name: "Fin", gender: "Unknown" as const, language: "en-US" }, + { id: "hype", name: "Hype", gender: "Unknown" as const, language: "en-US" }, + { id: "kora", name: "Kora", gender: "Unknown" as const, language: "en-US" }, + { id: "mango", name: "Mango", gender: "Unknown" as const, language: "en-US" }, + { id: "marek", name: "Marek", gender: "Unknown" as const, language: "en-US" }, + { id: "ogma", name: "Ogma", gender: "Unknown" as const, language: "en-US" }, + { id: "sora", name: "Sora", gender: "Unknown" as const, language: "en-US" }, + { id: "terrence", name: "Terrence", gender: "Unknown" as const, language: "en-US" }, + { id: "vitor", name: "Vitor", gender: "Unknown" as const, language: "en-US" }, + { id: "zach", name: "Zach", gender: "Unknown" as const, language: "en-US" }, ]; constructor(credentials: HumeTTSCredentials = {}) { @@ -166,9 +167,9 @@ export class HumeTTSClient extends AbstractTTSClient { gender: voice.gender as "Male" | "Female" | "Unknown", languageCodes: [ { - bcp47: voice.language || "en", - iso639_3: (voice.language || 
"en").split("-")[0], - display: voice.language || "English", + bcp47: voice.language || "en-US", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "hume" as any, diff --git a/src/engines/mistral.ts b/src/engines/mistral.ts index e130c1f..4ec0ba9 100644 --- a/src/engines/mistral.ts +++ b/src/engines/mistral.ts @@ -4,6 +4,7 @@ import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -29,33 +30,33 @@ export class MistralTTSClient extends AbstractTTSClient { private responseFormat: string; static readonly VOICES = [ - { id: "Amalthea", name: "Amalthea", gender: "Unknown" as const, language: "en" }, - { id: "Achan", name: "Achan", gender: "Unknown" as const, language: "en" }, - { id: "Brave", name: "Brave", gender: "Unknown" as const, language: "en" }, - { id: "Contessa", name: "Contessa", gender: "Unknown" as const, language: "en" }, - { id: "Daintree", name: "Daintree", gender: "Unknown" as const, language: "en" }, - { id: "Eugora", name: "Eugora", gender: "Unknown" as const, language: "en" }, - { id: "Fornax", name: "Fornax", gender: "Unknown" as const, language: "en" }, - { id: "Griffin", name: "Griffin", gender: "Unknown" as const, language: "en" }, - { id: "Hestia", name: "Hestia", gender: "Unknown" as const, language: "en" }, - { id: "Irving", name: "Irving", gender: "Unknown" as const, language: "en" }, - { id: "Jasmine", name: "Jasmine", gender: "Unknown" as const, language: "en" }, - { id: "Kestra", name: "Kestra", gender: "Unknown" as const, language: "en" }, - { id: "Lorentz", name: "Lorentz", gender: "Unknown" as const, language: "en" }, - { id: "Mara", name: "Mara", gender: "Unknown" as const, 
language: "en" }, - { id: "Nettle", name: "Nettle", gender: "Unknown" as const, language: "en" }, - { id: "Orin", name: "Orin", gender: "Unknown" as const, language: "en" }, - { id: "Puck", name: "Puck", gender: "Unknown" as const, language: "en" }, - { id: "Quinn", name: "Quinn", gender: "Unknown" as const, language: "en" }, - { id: "Rune", name: "Rune", gender: "Unknown" as const, language: "en" }, - { id: "Simbe", name: "Simbe", gender: "Unknown" as const, language: "en" }, - { id: "Tertia", name: "Tertia", gender: "Unknown" as const, language: "en" }, - { id: "Umbriel", name: "Umbriel", gender: "Unknown" as const, language: "en" }, - { id: "Vesta", name: "Vesta", gender: "Unknown" as const, language: "en" }, - { id: "Wystan", name: "Wystan", gender: "Unknown" as const, language: "en" }, - { id: "Xeno", name: "Xeno", gender: "Unknown" as const, language: "en" }, - { id: "Yara", name: "Yara", gender: "Unknown" as const, language: "en" }, - { id: "Zephyr", name: "Zephyr", gender: "Unknown" as const, language: "en" }, + { id: "Amalthea", name: "Amalthea", gender: "Unknown" as const, language: "en-US" }, + { id: "Achan", name: "Achan", gender: "Unknown" as const, language: "en-US" }, + { id: "Brave", name: "Brave", gender: "Unknown" as const, language: "en-US" }, + { id: "Contessa", name: "Contessa", gender: "Unknown" as const, language: "en-US" }, + { id: "Daintree", name: "Daintree", gender: "Unknown" as const, language: "en-US" }, + { id: "Eugora", name: "Eugora", gender: "Unknown" as const, language: "en-US" }, + { id: "Fornax", name: "Fornax", gender: "Unknown" as const, language: "en-US" }, + { id: "Griffin", name: "Griffin", gender: "Unknown" as const, language: "en-US" }, + { id: "Hestia", name: "Hestia", gender: "Unknown" as const, language: "en-US" }, + { id: "Irving", name: "Irving", gender: "Unknown" as const, language: "en-US" }, + { id: "Jasmine", name: "Jasmine", gender: "Unknown" as const, language: "en-US" }, + { id: "Kestra", name: "Kestra", 
gender: "Unknown" as const, language: "en-US" }, + { id: "Lorentz", name: "Lorentz", gender: "Unknown" as const, language: "en-US" }, + { id: "Mara", name: "Mara", gender: "Unknown" as const, language: "en-US" }, + { id: "Nettle", name: "Nettle", gender: "Unknown" as const, language: "en-US" }, + { id: "Orin", name: "Orin", gender: "Unknown" as const, language: "en-US" }, + { id: "Puck", name: "Puck", gender: "Unknown" as const, language: "en-US" }, + { id: "Quinn", name: "Quinn", gender: "Unknown" as const, language: "en-US" }, + { id: "Rune", name: "Rune", gender: "Unknown" as const, language: "en-US" }, + { id: "Simbe", name: "Simbe", gender: "Unknown" as const, language: "en-US" }, + { id: "Tertia", name: "Tertia", gender: "Unknown" as const, language: "en-US" }, + { id: "Umbriel", name: "Umbriel", gender: "Unknown" as const, language: "en-US" }, + { id: "Vesta", name: "Vesta", gender: "Unknown" as const, language: "en-US" }, + { id: "Wystan", name: "Wystan", gender: "Unknown" as const, language: "en-US" }, + { id: "Xeno", name: "Xeno", gender: "Unknown" as const, language: "en-US" }, + { id: "Yara", name: "Yara", gender: "Unknown" as const, language: "en-US" }, + { id: "Zephyr", name: "Zephyr", gender: "Unknown" as const, language: "en-US" }, ]; constructor(credentials: MistralTTSCredentials = {}) { @@ -178,9 +179,9 @@ export class MistralTTSClient extends AbstractTTSClient { gender: voice.gender as "Male" | "Female" | "Unknown", languageCodes: [ { - bcp47: voice.language || "en", - iso639_3: (voice.language || "en").split("-")[0], - display: voice.language || "English", + bcp47: voice.language || "en-US", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "mistral" as any, diff --git a/src/engines/murf.ts b/src/engines/murf.ts index 5759027..473ed62 100644 --- a/src/engines/murf.ts +++ b/src/engines/murf.ts @@ -4,6 +4,7 @@ import * as SpeechMarkdown from "../markdown/converter"; import 
type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -161,8 +162,8 @@ export class MurfTTSClient extends AbstractTTSClient { languageCodes: [ { bcp47: voice.language || "en-US", - iso639_3: (voice.language || "en-US").split("-")[0], - display: voice.language || "English (US)", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "murf" as any, diff --git a/src/engines/resemble.ts b/src/engines/resemble.ts index 007933c..09b40b4 100644 --- a/src/engines/resemble.ts +++ b/src/engines/resemble.ts @@ -4,6 +4,7 @@ import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { base64ToUint8Array } from "../utils/base64-utils"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -139,9 +140,9 @@ export class ResembleTTSClient extends AbstractTTSClient { gender: (voice.gender || "Unknown") as "Male" | "Female" | "Unknown", languageCodes: [ { - bcp47: voice.language || "en", - iso639_3: (voice.language || "en").split("-")[0], - display: voice.language || "English", + bcp47: voice.language || "en-US", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "resemble" as any, diff --git a/src/engines/unrealspeech.ts b/src/engines/unrealspeech.ts index 30c4b77..e591a0a 100644 --- a/src/engines/unrealspeech.ts +++ b/src/engines/unrealspeech.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; 
import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -146,8 +147,8 @@ export class UnrealSpeechTTSClient extends AbstractTTSClient { languageCodes: [ { bcp47: voice.language || "en-US", - iso639_3: (voice.language || "en-US").split("-")[0], - display: voice.language || "English (US)", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "unrealspeech" as any, diff --git a/src/engines/xai.ts b/src/engines/xai.ts index 8313807..b3f1f12 100644 --- a/src/engines/xai.ts +++ b/src/engines/xai.ts @@ -3,6 +3,7 @@ import * as SSMLUtils from "../core/ssml-utils"; import * as SpeechMarkdown from "../markdown/converter"; import type { SpeakOptions, TTSCredentials, UnifiedVoice } from "../types"; import { getFetch } from "../utils/fetch-utils"; +import { toIso639_3, toLanguageDisplay } from "../utils/language-utils"; const fetch = getFetch(); @@ -28,12 +29,12 @@ export class XaiTTSClient extends AbstractTTSClient { private language: string; static readonly VOICES = [ - { id: "avalon-47", name: "Avalon", gender: "Female" as const, language: "en" }, - { id: "orion-56", name: "Orion", gender: "Male" as const, language: "en" }, - { id: "luna-30", name: "Luna", gender: "Female" as const, language: "en" }, - { id: "atlas-84", name: "Atlas", gender: "Male" as const, language: "en" }, - { id: "aria-42", name: "Aria", gender: "Female" as const, language: "en" }, - { id: "cosmo-01", name: "Cosmo", gender: "Male" as const, language: "en" }, + { id: "avalon-47", name: "Avalon", gender: "Female" as const, language: "en-US" }, + { id: "orion-56", name: "Orion", gender: "Male" as const, language: "en-US" }, + { id: "luna-30", name: "Luna", gender: "Female" as const, language: "en-US" }, + { id: "atlas-84", name: "Atlas", gender: "Male" as const, language: "en-US" }, + { id: "aria-42", name: "Aria", gender: "Female" as const, 
language: "en-US" }, + { id: "cosmo-01", name: "Cosmo", gender: "Male" as const, language: "en-US" }, ]; constructor(credentials: XaiTTSCredentials = {}) { @@ -163,9 +164,9 @@ export class XaiTTSClient extends AbstractTTSClient { gender: voice.gender as "Male" | "Female" | "Unknown", languageCodes: [ { - bcp47: voice.language || "en", - iso639_3: (voice.language || "en").split("-")[0], - display: voice.language || "English", + bcp47: voice.language || "en-US", + iso639_3: toIso639_3(voice.language || "en-US"), + display: toLanguageDisplay(voice.language || "en-US"), }, ], provider: "xai" as any, diff --git a/src/utils/language-utils.ts b/src/utils/language-utils.ts new file mode 100644 index 0000000..c8bda7a --- /dev/null +++ b/src/utils/language-utils.ts @@ -0,0 +1,416 @@ +const ISO_639_1_TO_639_3: Record = { + aa: "aar", + ab: "abk", + af: "afr", + ak: "aka", + am: "amh", + an: "arg", + ar: "ara", + as: "asm", + av: "ava", + ay: "aym", + az: "aze", + ba: "bak", + be: "bel", + bg: "bul", + bh: "bih", + bi: "bis", + bm: "bam", + bn: "ben", + bo: "bod", + br: "bre", + bs: "bos", + ca: "cat", + ce: "che", + ch: "cha", + co: "cos", + cr: "cre", + cs: "ces", + cu: "chu", + cv: "chv", + cy: "cym", + da: "dan", + de: "deu", + dv: "div", + dz: "dzo", + ee: "ewe", + el: "ell", + en: "eng", + eo: "epo", + es: "spa", + et: "est", + eu: "eus", + fa: "fas", + ff: "ful", + fi: "fin", + fj: "fij", + fo: "fao", + fr: "fra", + fy: "fry", + ga: "gle", + gd: "gla", + gl: "glg", + gn: "grn", + gu: "guj", + gv: "glv", + ha: "hau", + he: "heb", + hi: "hin", + ho: "hmo", + hr: "hrv", + ht: "hat", + hu: "hun", + hy: "hye", + hz: "her", + ia: "ina", + id: "ind", + ie: "ile", + ig: "ibo", + ii: "iii", + ik: "ipk", + io: "ido", + is: "isl", + it: "ita", + iu: "iku", + ja: "jpn", + jv: "jav", + ka: "kat", + kg: "kon", + ki: "kik", + kj: "kua", + kk: "kaz", + kl: "kal", + km: "khm", + kn: "kan", + ko: "kor", + kr: "kau", + ks: "kas", + ku: "kur", + kv: "kom", + kw: "cor", + ky: "kir", + la: 
"lat", + lb: "ltz", + lg: "lug", + li: "lim", + ln: "lin", + lo: "lao", + lt: "lit", + lu: "lub", + lv: "lav", + mg: "mlg", + mh: "mah", + mi: "mri", + mk: "mkd", + ml: "mal", + mn: "mon", + mr: "mar", + ms: "msa", + mt: "mlt", + my: "mya", + na: "nau", + nb: "nob", + nd: "nde", + ne: "nep", + ng: "ndo", + nl: "nld", + nn: "nno", + no: "nor", + nr: "nbl", + nv: "nav", + ny: "nya", + oc: "oci", + oj: "oji", + om: "orm", + or: "ori", + os: "oss", + pa: "pan", + pi: "pli", + pl: "pol", + ps: "pus", + pt: "por", + qu: "que", + rm: "roh", + rn: "run", + ro: "ron", + ru: "rus", + rw: "kin", + sa: "san", + sc: "srd", + sd: "snd", + se: "sme", + sg: "sag", + si: "sin", + sk: "slk", + sl: "slv", + sm: "smo", + sn: "sna", + so: "som", + sq: "sqi", + sr: "srp", + ss: "ssw", + st: "sot", + su: "sun", + sv: "swe", + sw: "swa", + ta: "tam", + te: "tel", + tg: "tgk", + th: "tha", + ti: "tir", + tk: "tuk", + tl: "tgl", + tn: "tsn", + to: "ton", + tr: "tur", + ts: "tso", + tt: "tat", + tw: "twi", + ty: "tah", + ug: "uig", + uk: "ukr", + ur: "urd", + uz: "uzb", + ve: "ven", + vi: "vie", + vo: "vol", + wa: "wln", + wo: "wol", + xh: "xho", + yi: "yid", + yo: "yor", + za: "zha", + zh: "zho", + zu: "zul", +}; + +const BCP47_TO_DISPLAY: Record = { + "af-ZA": "Afrikaans (South Africa)", + "am-ET": "Amharic (Ethiopia)", + "ar-AE": "Arabic (UAE)", + "ar-BH": "Arabic (Bahrain)", + "ar-DZ": "Arabic (Algeria)", + "ar-EG": "Arabic (Egypt)", + "ar-IQ": "Arabic (Iraq)", + "ar-JO": "Arabic (Jordan)", + "ar-KW": "Arabic (Kuwait)", + "ar-LB": "Arabic (Lebanon)", + "ar-LY": "Arabic (Libya)", + "ar-MA": "Arabic (Morocco)", + "ar-OM": "Arabic (Oman)", + "ar-QA": "Arabic (Qatar)", + "ar-SA": "Arabic (Saudi Arabia)", + "ar-SY": "Arabic (Syria)", + "ar-TN": "Arabic (Tunisia)", + "ar-YE": "Arabic (Yemen)", + "az-AZ": "Azerbaijani (Azerbaijan)", + "bg-BG": "Bulgarian (Bulgaria)", + "bn-BD": "Bengali (Bangladesh)", + "bn-IN": "Bengali (India)", + "ca-ES": "Catalan (Spain)", + "cs-CZ": "Czech (Czech 
Republic)", + "cy-GB": "Welsh (United Kingdom)", + "da-DK": "Danish (Denmark)", + "de-AT": "German (Austria)", + "de-CH": "German (Switzerland)", + "de-DE": "German (Germany)", + "el-GR": "Greek (Greece)", + "en-AU": "English (Australia)", + "en-CA": "English (Canada)", + "en-GB": "English (United Kingdom)", + "en-IE": "English (Ireland)", + "en-IN": "English (India)", + "en-NZ": "English (New Zealand)", + "en-PH": "English (Philippines)", + "en-SG": "English (Singapore)", + "en-US": "English (United States)", + "en-ZA": "English (South Africa)", + "es-AR": "Spanish (Argentina)", + "es-BO": "Spanish (Bolivia)", + "es-CL": "Spanish (Chile)", + "es-CO": "Spanish (Colombia)", + "es-CR": "Spanish (Costa Rica)", + "es-DO": "Spanish (Dominican Republic)", + "es-EC": "Spanish (Ecuador)", + "es-ES": "Spanish (Spain)", + "es-GT": "Spanish (Guatemala)", + "es-HN": "Spanish (Honduras)", + "es-MX": "Spanish (Mexico)", + "es-NI": "Spanish (Nicaragua)", + "es-PA": "Spanish (Panama)", + "es-PE": "Spanish (Peru)", + "es-PR": "Spanish (Puerto Rico)", + "es-PY": "Spanish (Paraguay)", + "es-SV": "Spanish (El Salvador)", + "es-US": "Spanish (United States)", + "es-UY": "Spanish (Uruguay)", + "es-VE": "Spanish (Venezuela)", + "et-EE": "Estonian (Estonia)", + "eu-ES": "Basque (Spain)", + "fa-IR": "Persian (Iran)", + "fi-FI": "Finnish (Finland)", + "fil-PH": "Filipino (Philippines)", + "fr-BE": "French (Belgium)", + "fr-CA": "French (Canada)", + "fr-CH": "French (Switzerland)", + "fr-FR": "French (France)", + "ga-IE": "Irish (Ireland)", + "gl-ES": "Galician (Spain)", + "gu-IN": "Gujarati (India)", + "he-IL": "Hebrew (Israel)", + "hi-IN": "Hindi (India)", + "hr-HR": "Croatian (Croatia)", + "hu-HU": "Hungarian (Hungary)", + "hy-AM": "Armenian (Armenia)", + "id-ID": "Indonesian (Indonesia)", + "is-IS": "Icelandic (Iceland)", + "it-IT": "Italian (Italy)", + "ja-JP": "Japanese (Japan)", + "jv-ID": "Javanese (Indonesia)", + "ka-GE": "Georgian (Georgia)", + "kk-KZ": "Kazakh (Kazakhstan)", + 
"km-KH": "Khmer (Cambodia)", + "kn-IN": "Kannada (India)", + "ko-KR": "Korean (South Korea)", + "lo-LA": "Lao (Laos)", + "lt-LT": "Lithuanian (Lithuania)", + "lv-LV": "Latvian (Latvia)", + "mk-MK": "Macedonian (Macedonia)", + "ml-IN": "Malayalam (India)", + "mn-MN": "Mongolian (Mongolia)", + "mr-IN": "Marathi (India)", + "ms-MY": "Malay (Malaysia)", + "mt-MT": "Maltese (Malta)", + "my-MM": "Burmese (Myanmar)", + "nb-NO": "Norwegian Bokmål (Norway)", + "ne-NP": "Nepali (Nepal)", + "nl-BE": "Dutch (Belgium)", + "nl-NL": "Dutch (Netherlands)", + "pa-IN": "Punjabi (India)", + "pl-PL": "Polish (Poland)", + "ps-AF": "Pashto (Afghanistan)", + "pt-BR": "Portuguese (Brazil)", + "pt-PT": "Portuguese (Portugal)", + "ro-RO": "Romanian (Romania)", + "ru-RU": "Russian (Russia)", + "si-LK": "Sinhala (Sri Lanka)", + "sk-SK": "Slovak (Slovakia)", + "sl-SI": "Slovenian (Slovenia)", + "so-SO": "Somali (Somalia)", + "sq-AL": "Albanian (Albania)", + "sr-RS": "Serbian (Serbia)", + "su-ID": "Sundanese (Indonesia)", + "sv-SE": "Swedish (Sweden)", + "sw-KE": "Swahili (Kenya)", + "sw-TZ": "Swahili (Tanzania)", + "ta-IN": "Tamil (India)", + "ta-LK": "Tamil (Sri Lanka)", + "ta-SG": "Tamil (Singapore)", + "te-IN": "Telugu (India)", + "th-TH": "Thai (Thailand)", + "tr-TR": "Turkish (Turkey)", + "uk-UA": "Ukrainian (Ukraine)", + "ur-PK": "Urdu (Pakistan)", + "uz-UZ": "Uzbek (Uzbekistan)", + "vi-VN": "Vietnamese (Vietnam)", + "yue-CN": "Cantonese (China)", + "yue-HK": "Cantonese (Hong Kong)", + "zh-CN": "Chinese (China)", + "zh-HK": "Chinese (Hong Kong)", + "zh-TW": "Chinese (Taiwan)", +}; + +const LANGUAGE_DISPLAY: Record = { + af: "Afrikaans", + am: "Amharic", + ar: "Arabic", + az: "Azerbaijani", + bg: "Bulgarian", + bn: "Bengali", + ca: "Catalan", + cs: "Czech", + cy: "Welsh", + da: "Danish", + de: "German", + el: "Greek", + en: "English", + es: "Spanish", + et: "Estonian", + eu: "Basque", + fa: "Persian", + fi: "Finnish", + fil: "Filipino", + fr: "French", + ga: "Irish", + gl: "Galician", + 
gu: "Gujarati", + he: "Hebrew", + hi: "Hindi", + hr: "Croatian", + hu: "Hungarian", + hy: "Armenian", + id: "Indonesian", + is: "Icelandic", + it: "Italian", + ja: "Japanese", + jv: "Javanese", + ka: "Georgian", + kk: "Kazakh", + km: "Khmer", + kn: "Kannada", + ko: "Korean", + lo: "Lao", + lt: "Lithuanian", + lv: "Latvian", + mk: "Macedonian", + ml: "Malayalam", + mn: "Mongolian", + mr: "Marathi", + ms: "Malay", + mt: "Maltese", + my: "Burmese", + nb: "Norwegian Bokmål", + ne: "Nepali", + nl: "Dutch", + pa: "Punjabi", + pl: "Polish", + ps: "Pashto", + pt: "Portuguese", + ro: "Romanian", + ru: "Russian", + si: "Sinhala", + sk: "Slovak", + sl: "Slovenian", + so: "Somali", + sq: "Albanian", + sr: "Serbian", + su: "Sundanese", + sv: "Swedish", + sw: "Swahili", + ta: "Tamil", + te: "Telugu", + th: "Thai", + tr: "Turkish", + uk: "Ukrainian", + ur: "Urdu", + uz: "Uzbek", + vi: "Vietnamese", + yue: "Cantonese", + zh: "Chinese", +}; + +export function toIso639_3(code: string): string { + const short = code.split("-")[0].toLowerCase(); + return ISO_639_1_TO_639_3[short] || short; +} + +export function toLanguageDisplay(bcp47: string): string { + if (BCP47_TO_DISPLAY[bcp47]) return BCP47_TO_DISPLAY[bcp47]; + const short = bcp47.split("-")[0].toLowerCase(); + return LANGUAGE_DISPLAY[short] || bcp47; +} + +export function normalizeBcp47(lang: string): string { + if (lang.includes("-")) return lang; + return `${lang}-US`; +} From fd69270ddc4b519ecb284cc61fdc792725329df5 Mon Sep 17 00:00:00 2001 From: will wade Date: Wed, 8 Apr 2026 21:41:59 +0100 Subject: [PATCH 5/7] feat: add model-level feature declarations across all engines - Add ModelFeature type: streaming, audio-tags, inline-voice-cloning, open-source, word-boundary-events, character-boundary-events, ssml - Add FEATURES constants and ModelInfo interface to types.ts - Add getModels(), hasFeature(), _getCurrentModelId() to AbstractTTSClient - Define accurate _models metadata for all 23 engines based on code audit: - Real 
word boundaries: elevenlabs, google (beta), polly, azure, watson, sherpaonnx - Character boundaries: elevenlabs (all models) - Native SSML: google, polly, azure, watson, witai, sapi - Audio tags: elevenlabs v3, openai gpt-4o-mini-tts, cartesia sonic-3, xai grok-tts, fish audio s2-pro - Voice cloning: elevenlabs v3, cartesia sonic-3, hume octave-2, fish audio s2-pro, mistral, resemble - Open source: sherpaonnx, sherpaonnx-wasm, espeak-ng, mistral, resemble --- src/core/abstract-tts.ts | 20 ++++++++++++++++++++ src/engines/azure.ts | 1 + src/engines/cartesia.ts | 4 ++++ src/engines/deepgram.ts | 5 +++++ src/engines/elevenlabs.ts | 25 +++++++++++++++++++++++++ src/engines/espeak.ts | 2 ++ src/engines/fishaudio.ts | 3 +++ src/engines/google.ts | 9 +++++++++ src/engines/hume.ts | 4 ++++ src/engines/mistral.ts | 6 ++++++ src/engines/modelslab.ts | 1 + src/engines/murf.ts | 4 ++++ src/engines/openai.ts | 5 +++++ src/engines/playht.ts | 5 +++++ src/engines/polly.ts | 7 +++++++ src/engines/resemble.ts | 3 +++ src/engines/sapi.ts | 2 ++ src/engines/sherpaonnx-wasm.ts | 2 ++ src/engines/sherpaonnx.ts | 2 ++ src/engines/unrealspeech.ts | 1 + src/engines/upliftai.ts | 1 + src/engines/watson.ts | 1 + src/engines/witai.ts | 2 ++ src/engines/xai.ts | 1 + src/types.ts | 24 ++++++++++++++++++++++++ 25 files changed, 140 insertions(+) diff --git a/src/core/abstract-tts.ts b/src/core/abstract-tts.ts index 809b7db..c3d6b59 100644 --- a/src/core/abstract-tts.ts +++ b/src/core/abstract-tts.ts @@ -2,6 +2,8 @@ import * as SpeechMarkdown from "../markdown/converter"; import { SSMLBuilder } from "../ssml/builder"; import type { CredentialsCheckResult, + ModelFeature, + ModelInfo, PropertyType, SimpleCallback, SpeakInput, @@ -67,6 +69,8 @@ export abstract class AbstractTTSClient { */ protected timings: Array<[number, number, string]> = []; + protected _models: ModelInfo[] = []; + /** * Capability signaling for UIs to filter providers without hardcoding names * Engines can override these in 
their constructors. @@ -1095,6 +1099,22 @@ export abstract class AbstractTTSClient { * Override in subclasses to provide engine-specific requirements * @returns Array of required credential field names */ + getModels(): ModelInfo[] { + return this._models; + } + + hasFeature(feature: ModelFeature, modelId?: string): boolean { + const target = modelId || this._getCurrentModelId(); + if (!target) return false; + const model = this._models.find((m) => m.id === target); + if (!model) return false; + return model.features.includes(feature); + } + + protected _getCurrentModelId(): string { + return (this as any).model || ""; + } + protected getRequiredCredentials(): string[] { return []; // Default: no credentials required } diff --git a/src/engines/azure.ts b/src/engines/azure.ts index 59fe6ed..4884c21 100644 --- a/src/engines/azure.ts +++ b/src/engines/azure.ts @@ -28,6 +28,7 @@ export class AzureTTSClient extends AbstractTTSClient { */ constructor(credentials: AzureTTSCredentials) { super(credentials); + this._models = [{ id: "azure", features: ["streaming", "ssml", "word-boundary-events"] }]; // Type assertion is safe here due to the AzureTTSCredentials interface this.subscriptionKey = credentials.subscriptionKey as string; this.region = credentials.region as string; diff --git a/src/engines/cartesia.ts b/src/engines/cartesia.ts index 8fab400..5268cc3 100644 --- a/src/engines/cartesia.ts +++ b/src/engines/cartesia.ts @@ -108,6 +108,10 @@ export class CartesiaTTSClient extends AbstractTTSClient { encoding: "pcm_f32le", sample_rate: 44100, }; + this._models = [ + { id: "sonic-3", features: ["streaming", "audio-tags", "inline-voice-cloning"] }, + { id: "sonic-2", features: ["streaming"] }, + ]; this.sampleRate = 44100; this.applyCredentialProperties(credentials); diff --git a/src/engines/deepgram.ts b/src/engines/deepgram.ts index 94f1178..a0b218d 100644 --- a/src/engines/deepgram.ts +++ b/src/engines/deepgram.ts @@ -71,6 +71,11 @@ export class DeepgramTTSClient extends 
AbstractTTSClient { this.model = (credentials as any).model || "aura-2"; this.voiceId = "aura-2-apollo-en"; + this._models = [ + { id: "aura-2", features: ["streaming"] }, + { id: "aura", features: ["streaming"] }, + ]; + this.applyCredentialProperties(credentials); } diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index b4e826a..9fc23aa 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -97,6 +97,31 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { */ constructor(credentials: ElevenLabsCredentials = {}) { super(credentials); + this._models = [ + { + id: "eleven_v3", + features: [ + "audio-tags", + "inline-voice-cloning", + "word-boundary-events", + "character-boundary-events", + ], + }, + { id: "eleven_turbo_v2_5", features: ["word-boundary-events", "character-boundary-events"] }, + { id: "eleven_turbo_v2", features: ["word-boundary-events", "character-boundary-events"] }, + { + id: "eleven_monolingual_v1", + features: ["word-boundary-events", "character-boundary-events"], + }, + { + id: "eleven_multilingual_v1", + features: ["word-boundary-events", "character-boundary-events"], + }, + { + id: "eleven_multilingual_v2", + features: ["word-boundary-events", "character-boundary-events"], + }, + ]; this.apiKey = credentials.apiKey || process.env.ELEVENLABS_API_KEY || ""; this.modelId = (credentials as any).modelId || (credentials as any).model || "eleven_multilingual_v2"; diff --git a/src/engines/espeak.ts b/src/engines/espeak.ts index 362a421..4d1160d 100644 --- a/src/engines/espeak.ts +++ b/src/engines/espeak.ts @@ -115,6 +115,8 @@ export class EspeakNodeTTSClient extends AbstractTTSClient { constructor(credentials: TTSCredentials = {}) { super(credentials); + this._models = [{ id: "espeak-ng", features: ["open-source"] }]; + // Set a default voice for eSpeak TTS this.voiceId = "en"; // Default English voice } diff --git a/src/engines/fishaudio.ts b/src/engines/fishaudio.ts index c20e9af..04c40b4 100644 --- 
a/src/engines/fishaudio.ts +++ b/src/engines/fishaudio.ts @@ -36,6 +36,9 @@ export class FishAudioTTSClient extends AbstractTTSClient { this.baseUrl = credentials.baseURL || "https://api.fish.audio"; this.model = (credentials as any).model || "s2-pro"; this.voiceId = ""; + this._models = [ + { id: "s2-pro", features: ["streaming", "audio-tags", "inline-voice-cloning"] }, + ]; this.sampleRate = 44100; this.applyCredentialProperties(credentials); diff --git a/src/engines/google.ts b/src/engines/google.ts index 3398532..920744a 100644 --- a/src/engines/google.ts +++ b/src/engines/google.ts @@ -65,6 +65,15 @@ export class GoogleTTSClient extends AbstractTTSClient { constructor(credentials: GoogleTTSCredentials) { super(credentials); + this._models = [ + { id: "studio", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "standard", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "waveNet", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "journey", features: ["streaming", "ssml"] }, + { id: "neural2", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "polyglot", features: ["streaming", "ssml"] }, + ]; + // Store the credentials for later use this.googleCredentials = credentials; this.client = null; diff --git a/src/engines/hume.ts b/src/engines/hume.ts index 1dee088..baf7368 100644 --- a/src/engines/hume.ts +++ b/src/engines/hume.ts @@ -51,6 +51,10 @@ export class HumeTTSClient extends AbstractTTSClient { this.baseUrl = credentials.baseURL || "https://api.hume.ai/v0"; this.model = (credentials as any).model || "octave-2"; this.voiceId = "aac4caff-e2e1-4088-9d58-a29c5d22dce6"; + this._models = [ + { id: "octave-2", features: ["streaming", "inline-voice-cloning"] }, + { id: "octave-1", features: ["streaming"] }, + ]; this.sampleRate = 24000; this.applyCredentialProperties(credentials); diff --git a/src/engines/mistral.ts b/src/engines/mistral.ts index 4ec0ba9..0f7d6f6 100644 --- a/src/engines/mistral.ts 
+++ b/src/engines/mistral.ts @@ -66,6 +66,12 @@ export class MistralTTSClient extends AbstractTTSClient { this.model = (credentials as any).model || "voxtral-mini-tts-2603"; this.voiceId = ""; this.responseFormat = "mp3"; + this._models = [ + { + id: "voxtral-mini-tts-2603", + features: ["streaming", "inline-voice-cloning", "open-source"], + }, + ]; this.sampleRate = 24000; this.applyCredentialProperties(credentials); diff --git a/src/engines/modelslab.ts b/src/engines/modelslab.ts index 78a17f3..b76dc3c 100644 --- a/src/engines/modelslab.ts +++ b/src/engines/modelslab.ts @@ -119,6 +119,7 @@ export class ModelsLabTTSClient extends AbstractTTSClient { constructor(credentials: ModelsLabTTSCredentials = {}) { super(credentials); + this._models = [{ id: "modelslab", features: [] }]; this.apiKey = credentials.apiKey || (typeof process !== "undefined" ? (process.env.MODELSLAB_API_KEY ?? "") : ""); diff --git a/src/engines/murf.ts b/src/engines/murf.ts index 473ed62..23a378c 100644 --- a/src/engines/murf.ts +++ b/src/engines/murf.ts @@ -51,6 +51,10 @@ export class MurfTTSClient extends AbstractTTSClient { this.baseUrl = credentials.baseURL || "https://api.murf.ai/v1"; this.model = (credentials as any).model || "GEN2"; this.voiceId = "en-US-natalie"; + this._models = [ + { id: "GEN2", features: ["streaming"] }, + { id: "FALCON", features: ["streaming"] }, + ]; this.sampleRate = 24000; this.applyCredentialProperties(credentials); diff --git a/src/engines/openai.ts b/src/engines/openai.ts index f7ca703..961b0db 100644 --- a/src/engines/openai.ts +++ b/src/engines/openai.ts @@ -140,6 +140,11 @@ export class OpenAITTSClient extends AbstractTTSClient { */ constructor(credentials: OpenAITTSCredentials = {}) { super(credentials); + this._models = [ + { id: "gpt-4o-mini-tts", features: ["streaming", "audio-tags"] }, + { id: "tts-1", features: ["streaming"] }, + { id: "tts-1-hd", features: ["streaming"] }, + ]; this.credentials = credentials; // Don't initialize client here, load 
it on demand diff --git a/src/engines/playht.ts b/src/engines/playht.ts index 52222a6..10697c0 100644 --- a/src/engines/playht.ts +++ b/src/engines/playht.ts @@ -77,6 +77,11 @@ export class PlayHTTTSClient extends AbstractTTSClient { constructor(credentials: PlayHTTTSCredentials = {}) { super(credentials); + this._models = [ + { id: "playht", features: ["streaming"] }, + { id: "playht2.5-turbo", features: ["streaming"] }, + ]; + // Set credentials this.apiKey = credentials.apiKey || process.env.PLAYHT_API_KEY || ""; this.userId = credentials.userId || process.env.PLAYHT_USER_ID || ""; diff --git a/src/engines/polly.ts b/src/engines/polly.ts index 0bc8cbb..7905759 100644 --- a/src/engines/polly.ts +++ b/src/engines/polly.ts @@ -58,6 +58,13 @@ export class PollyTTSClient extends AbstractTTSClient { constructor(credentials: PollyTTSCredentials) { super(credentials); + this._models = [ + { id: "generative", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "long-form", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "neural", features: ["streaming", "ssml", "word-boundary-events"] }, + { id: "standard", features: ["streaming", "ssml", "word-boundary-events"] }, + ]; + // Set the default sample rate for PCM format to match the Python implementation // The Python implementation uses wav.setparams((1, 2, 16000, 0, "NONE", "NONE")) this.sampleRate = 16000; // Default sample rate for Polly PCM format diff --git a/src/engines/resemble.ts b/src/engines/resemble.ts index 09b40b4..c2def31 100644 --- a/src/engines/resemble.ts +++ b/src/engines/resemble.ts @@ -29,6 +29,9 @@ export class ResembleTTSClient extends AbstractTTSClient { this.apiKey = credentials.apiKey || process.env.RESEMBLE_API_KEY || ""; this.baseUrl = credentials.baseURL || "https://f.cluster.resemble.ai"; this.voiceId = ""; + this._models = [ + { id: "default", features: ["streaming", "inline-voice-cloning", "open-source"] }, + ]; this.sampleRate = 22050; 
this.applyCredentialProperties(credentials); diff --git a/src/engines/sapi.ts b/src/engines/sapi.ts index 6af203a..b77710a 100644 --- a/src/engines/sapi.ts +++ b/src/engines/sapi.ts @@ -42,6 +42,8 @@ export class SAPITTSClient extends AbstractTTSClient { constructor(credentials: SAPITTSCredentials = {}) { super(credentials); + this._models = [{ id: "sapi", features: ["ssml"] }]; + // Validate Windows environment this.validateEnvironment(); diff --git a/src/engines/sherpaonnx-wasm.ts b/src/engines/sherpaonnx-wasm.ts index f8660fe..ef0141e 100644 --- a/src/engines/sherpaonnx-wasm.ts +++ b/src/engines/sherpaonnx-wasm.ts @@ -167,6 +167,8 @@ export class SherpaOnnxWasmTTSClient extends AbstractTTSClient { constructor(credentials: TTSCredentials = {}, enhancedOptions: EnhancedWasmOptions = {}) { super(credentials); + this._models = [{ id: "sherpaonnx-wasm", features: ["open-source"] }]; + // Capabilities: Browser-only engine, requires WASM runtime this.capabilities = { browserSupported: true, nodeSupported: false, needsWasm: true }; diff --git a/src/engines/sherpaonnx.ts b/src/engines/sherpaonnx.ts index c8296ec..a4ccbd2 100644 --- a/src/engines/sherpaonnx.ts +++ b/src/engines/sherpaonnx.ts @@ -163,6 +163,8 @@ export class SherpaOnnxTTSClient extends AbstractTTSClient { constructor(credentials: SherpaOnnxTTSCredentials) { super(credentials); + this._models = [{ id: "sherpaonnx", features: ["open-source", "word-boundary-events"] }]; + // Initialize instance variables with proper null/undefined checking this.modelPath = credentials?.modelPath || null; this.modelId = credentials?.modelId || null; diff --git a/src/engines/unrealspeech.ts b/src/engines/unrealspeech.ts index e591a0a..bd9ee64 100644 --- a/src/engines/unrealspeech.ts +++ b/src/engines/unrealspeech.ts @@ -40,6 +40,7 @@ export class UnrealSpeechTTSClient extends AbstractTTSClient { this.apiKey = credentials.apiKey || process.env.UNREAL_SPEECH_API_KEY || ""; this.baseUrl = credentials.baseURL ||
"https://api.v8.unrealspeech.com"; this.voiceId = "Sierra"; + this._models = [{ id: "default", features: ["streaming"] }]; this.sampleRate = 24000; this.applyCredentialProperties(credentials); diff --git a/src/engines/upliftai.ts b/src/engines/upliftai.ts index 935d1ec..57246c3 100644 --- a/src/engines/upliftai.ts +++ b/src/engines/upliftai.ts @@ -93,6 +93,7 @@ export class UpliftAITTSClient extends AbstractTTSClient { */ constructor(credentials: UpliftAITTSCredentials = {}) { super(credentials); + this._models = [{ id: "upliftai", features: [] }]; this.apiKey = credentials.apiKey || process.env.UPLIFTAI_API_KEY || ""; this.outputFormat = "MP3_22050_128"; // Default format } diff --git a/src/engines/watson.ts b/src/engines/watson.ts index 1d783c9..cad7cc4 100644 --- a/src/engines/watson.ts +++ b/src/engines/watson.ts @@ -31,6 +31,7 @@ export class WatsonTTSClient extends AbstractTTSClient { */ constructor(credentials: WatsonTTSCredentials) { super(credentials); + this._models = [{ id: "watson", features: ["streaming", "ssml", "word-boundary-events"] }]; this.apiKey = credentials.apiKey as string; this.region = credentials.region as string; this.instanceId = credentials.instanceId as string; diff --git a/src/engines/witai.ts b/src/engines/witai.ts index bb13b8b..a67ded3 100644 --- a/src/engines/witai.ts +++ b/src/engines/witai.ts @@ -27,6 +27,8 @@ export class WitAITTSClient extends AbstractTTSClient { constructor(credentials: WitAITTSCredentials) { super(credentials); + this._models = [{ id: "witai", features: ["streaming", "ssml"] }]; + if (!credentials.token) { throw new Error("An API token for Wit.ai must be provided"); } diff --git a/src/engines/xai.ts b/src/engines/xai.ts index b3f1f12..a1c7369 100644 --- a/src/engines/xai.ts +++ b/src/engines/xai.ts @@ -44,6 +44,7 @@ export class XaiTTSClient extends AbstractTTSClient { this.model = (credentials as any).model || "grok-tts"; this.voiceId = "avalon-47"; this.language = "auto"; + this._models = [{ id:
"grok-tts", features: ["streaming", "audio-tags"] }]; this.sampleRate = 24000; this.applyCredentialProperties(credentials); diff --git a/src/types.ts b/src/types.ts index 78d5eb3..8c78d66 100644 --- a/src/types.ts +++ b/src/types.ts @@ -189,6 +189,33 @@ export type SimpleCallback = () => void; /** * Property type for TTS properties */ export type PropertyType = string | number | boolean; + +/** Capability tag that a model entry can advertise in its `features` list. */ +export type ModelFeature = + | "streaming" + | "audio-tags" + | "inline-voice-cloning" + | "open-source" + | "word-boundary-events" + | "character-boundary-events" + | "ssml"; + +/** Named constants for every `ModelFeature` value. */ +export const FEATURES: Record<string, ModelFeature> = { + STREAMING: "streaming", + AUDIO_TAGS: "audio-tags", + INLINE_VOICE_CLONING: "inline-voice-cloning", + OPEN_SOURCE: "open-source", + WORD_BOUNDARY_EVENTS: "word-boundary-events", + CHARACTER_BOUNDARY_EVENTS: "character-boundary-events", + SSML: "ssml", +} as const; + +/** One model exposed by an engine: its identifier and the features it supports. */ +export interface ModelInfo { + id: string; + features: ModelFeature[]; +} /** From d34ceda1e4aa6bc78136a533194c7f927460150e Mon Sep 17 00:00:00 2001 From: will wade Date: Thu, 9 Apr 2026 03:35:33 +0100 Subject: [PATCH 6/7] fix: ElevenLabs true streaming in synthToBytestream - Pipe response.body directly when not using timestamps (avoids buffering) - Only buffer when useTimestamps=true (needs JSON response for alignment) - Only buffer when format=wav (needs mp3-to-wav conversion) - Add streaming feature to all ElevenLabs models - Update BACKLOG: mark streaming improvements as done for Cartesia, Deepgram, ElevenLabs, Polly; note Google Cloud TTS SDK limitation --- BACKLOG.md | 11 +++-- src/engines/elevenlabs.ts | 87 +++++++++++++++++++++++++++------------ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/BACKLOG.md b/BACKLOG.md index 08192df..986174b 100644 --- a/BACKLOG.md +++ b/BACKLOG.md @@ -67,10 +67,13 @@ Providers that support inline voice cloning: ### Streaming Improvements -- Cartesia: true streaming via WebSocket or SSE -- Deepgram: true streaming -- Google Gemini: pseudo-streaming
(SSE base64 chunks) -- Standardize `synthToBytestream` to return actual streaming responses where supported +- [x] Cartesia: true streaming (already pipes response.body) +- [x] Deepgram: true streaming (already pipes response.body) +- [x] ElevenLabs: true streaming (fixed — pipes response.body when not using timestamps) +- [x] Polly: true streaming for MP3/OGG (already pipes AudioStream; WAV requires buffering for header) +- [x] Standardize `synthToBytestream` to return actual streaming responses where supported +- Google Cloud TTS: SDK returns all audio at once — would need StreamingSynthesize beta API +- Google Gemini TTS: pseudo-streaming via SSE base64 chunks (new engine, not yet implemented) ### Tree-Shakeable Subpath Exports diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index 9fc23aa..09a2bf6 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -101,25 +101,32 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { { id: "eleven_v3", features: [ + "streaming", "audio-tags", "inline-voice-cloning", "word-boundary-events", "character-boundary-events", ], }, - { id: "eleven_turbo_v2_5", features: ["word-boundary-events", "character-boundary-events"] }, - { id: "eleven_turbo_v2", features: ["word-boundary-events", "character-boundary-events"] }, + { + id: "eleven_turbo_v2_5", + features: ["streaming", "word-boundary-events", "character-boundary-events"], + }, + { + id: "eleven_turbo_v2", + features: ["streaming", "word-boundary-events", "character-boundary-events"], + }, { id: "eleven_monolingual_v1", - features: ["word-boundary-events", "character-boundary-events"], + features: ["streaming", "word-boundary-events", "character-boundary-events"], }, { id: "eleven_multilingual_v1", - features: ["word-boundary-events", "character-boundary-events"], + features: ["streaming", "word-boundary-events", "character-boundary-events"], }, { id: "eleven_multilingual_v2", - features: ["word-boundary-events", 
"character-boundary-events"], + features: ["streaming", "word-boundary-events", "character-boundary-events"], }, ]; this.apiKey = credentials.apiKey || process.env.ELEVENLABS_API_KEY || ""; @@ -627,26 +634,34 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { // Check if we need timing data const useTimestamps = options?.useTimestamps || options?.useWordBoundary; - let audioData: Uint8Array; + let audioStream: ReadableStream; let wordBoundaries: Array<{ text: string; offset: number; duration: number }> = []; if (useTimestamps) { - // Use the with-timestamps endpoint for timing data const timestampResponse = await this.synthWithTimestamps(preparedText, voiceId, options); - // Decode base64 audio data const audioBase64 = timestampResponse.audio_base64; - audioData = base64ToUint8Array(audioBase64); + const audioData = base64ToUint8Array(audioBase64); - // Convert character timing to word boundaries if (timestampResponse.alignment) { wordBoundaries = this.convertCharacterTimingToWordBoundaries( preparedText, timestampResponse.alignment ); } + + let finalData = audioData; + if (options?.format === "wav") { + finalData = await this.convertMp3ToWav(audioData); + } + + audioStream = new ReadableStream({ + start(controller) { + controller.enqueue(finalData); + controller.close(); + }, + }); } else { - // Use the regular streaming endpoint (no timing data) const payload = this.buildRequestPayload(preparedText, options); const requestOptions = { method: "POST", @@ -674,24 +689,44 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { throw err; } - const responseArrayBuffer = await response.arrayBuffer(); - audioData = new Uint8Array(responseArrayBuffer); - } + if (response.body) { + audioStream = response.body; + } else { + const arrayBuffer = await response.arrayBuffer(); + audioStream = new ReadableStream({ + start(controller) { + controller.enqueue(new Uint8Array(arrayBuffer)); + controller.close(); + }, + }); + } - // Convert to WAV if requested 
(since we always get MP3 from ElevenLabs) - if (options?.format === "wav") { - audioData = await this.convertMp3ToWav(audioData); + if (options?.format === "wav") { + const chunks: Uint8Array[] = []; + const reader = audioStream.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(value); + } + const totalLength = chunks.reduce((acc, c) => acc + c.length, 0); + const merged = new Uint8Array(totalLength); + let offset = 0; + for (const chunk of chunks) { + merged.set(chunk, offset); + offset += chunk.length; + } + const wavData = await this.convertMp3ToWav(merged); + audioStream = new ReadableStream({ + start(controller) { + controller.enqueue(wavData); + controller.close(); + }, + }); + } } - // Create a ReadableStream from the Uint8Array - const readableStream = new ReadableStream({ - start(controller) { - controller.enqueue(audioData); - controller.close(); - }, - }); - - return { audioStream: readableStream, wordBoundaries }; + return { audioStream, wordBoundaries }; } catch (error) { console.error("Error synthesizing speech stream:", error); throw error; From ef9d6b85d0a25039eebaa1001045a0b5f04ffdab Mon Sep 17 00:00:00 2001 From: Owen McGirr Date: Sat, 11 Apr 2026 10:01:50 +0100 Subject: [PATCH 7/7] refactor(elevenlabs): remove redundant V3_AUDIO_TAG_MODELS, reuse MODEL_V3 --- src/engines/elevenlabs.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/engines/elevenlabs.ts b/src/engines/elevenlabs.ts index 06cdfb9..36ec6bb 100644 --- a/src/engines/elevenlabs.ts +++ b/src/engines/elevenlabs.ts @@ -486,8 +486,6 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { private static readonly AUDIO_TAG_REGEX = /\[[^\]]+\]/g; - private static readonly V3_AUDIO_TAG_MODELS = ["eleven_v3"]; - /** * Prepare text for synthesis by stripping SSML tags. * ElevenLabs does not support SSML — use native [audio tags] for v3 expressiveness. 
@@ -521,9 +519,7 @@ export class ElevenLabsTTSClient extends AbstractTTSClient { */ private processAudioTags(text: string, options?: ElevenLabsTTSOptions): string { const modelId = this.resolveModelId(options); - const isAudioTagModel = ElevenLabsTTSClient.V3_AUDIO_TAG_MODELS.some((m) => - modelId.startsWith(m) - ); + const isAudioTagModel = modelId.startsWith(ElevenLabsTTSClient.MODEL_V3); if (isAudioTagModel) { return text;