diff --git a/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx b/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx new file mode 100644 index 0000000..a02eea9 --- /dev/null +++ b/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx @@ -0,0 +1,49 @@ +--- +openapi: post /v1/tts/with_timestamp +title: 'Text to Speech with Timestamp' +description: 'Generate speech with word-level timestamp alignment' +icon: "clock" +iconType: "solid" +--- + + +This endpoint generates complete audio first, then aligns it with the input text to provide precise timing information for each segment. The response includes both the audio and an array of timestamp segments. + + +## Response Format + +The response is a JSON object containing: + +| Field | Type | Description | +|-------|------|-------------| +| `audio_base64` | string | Base64-encoded audio data | +| `text` | string | The synthesized text (with emotion markers removed) | +| `alignment` | array | Array of timestamp segments | + +Each timestamp segment contains: + +| Field | Type | Description | +|-------|------|-------------| +| `text` | string | The text content of this segment | +| `start` | number | Start time in seconds | +| `end` | number | End time in seconds | + +## Example Response + +```json +{ + "audio_base64": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8...", + "text": "Hello, world!", + "alignment": [ + {"text": "Hello,", "start": 0.0, "end": 0.45}, + {"text": "world!", "start": 0.52, "end": 1.1} + ] +} +``` + +## Use Cases + +- **Subtitle generation**: Automatically create synchronized subtitles for video content +- **Karaoke-style highlighting**: Highlight words as they are spoken +- **Accessibility features**: Provide visual indicators synchronized with audio playback +- **Audio editing**: Precisely locate and edit specific words in generated speech \ No newline at end of file diff --git a/api-reference/openapi.json b/api-reference/openapi.json index b9eeac7..64a29fe 100644 --- 
a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -43,23 +43,23 @@ "schema": { "properties": { "text": { - "description": "Text to be converted to speech", + "description": "The text content to convert to speech. Supports multiple languages including English, Chinese, Japanese, and more. For best results, use proper punctuation and avoid mixing too many languages in a single request.", "title": "Text", "type": "string" }, "temperature": { - "description": "Controls randomness in the speech generation. Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic. We recommend `0.9` for `s1` model", + "description": "Controls the randomness/expressiveness of the generated speech. Higher values produce more varied and expressive speech, lower values produce more consistent output.", "title": "Temperature", "type": "number", - "default": 0.9, + "default": 0.7, "minimum": 0.0, "maximum": 1.0 }, "top_p": { - "description": "Controls diversity via nucleus sampling. Lower values (e.g., 0.1) make the output more focused, while higher values (e.g., 1.0) allow more diversity. We recommend `0.9` for `s1` model", + "description": "Controls diversity via nucleus sampling. Lower values produce more focused/predictable speech, higher values allow more variety.", "title": "Top P", "type": "number", - "default": 0.9, + "default": 0.7, "minimum": 0.0, "maximum": 1.0 }, @@ -75,7 +75,7 @@ "type": "null" } ], - "description": "References to be used for the speech, this requires MessagePack serialization, this will override reference_voices and reference_texts", + "description": "Custom voice references for zero-shot voice cloning. Provide audio samples with their transcripts to clone a voice on-the-fly without creating a model. Requires MessagePack serialization (not available via JSON). Takes precedence over reference_id if both are provided. 
For best results, use 10-30 seconds of clean audio.", "title": "References" }, "reference_id": { @@ -88,7 +88,7 @@ } ], "default": null, - "description": "ID of the reference model o be used for the speech", + "description": "The unique identifier of a voice model to use for speech synthesis. You can find model IDs in the Fish Audio voice library or use your own custom-trained models. If not specified and no references provided, a default voice will be used.", "title": "Reference Id" }, "prosody": { @@ -101,11 +101,11 @@ } ], "default": null, - "description": "Prosody to be used for the speech" + "description": "Fine-tune the speech output with speed and volume adjustments. Speed controls speaking rate (1.0 = normal, <1.0 = slower, >1.0 = faster). Volume adjusts loudness in decibels (0 = unchanged, positive = louder, negative = quieter)." }, "chunk_length": { - "default": 200, - "description": "Chunk length to be used for the speech", + "default": 300, + "description": "Controls the size of text segments for processing. The text is split into chunks before synthesis. Higher values produce longer continuous speech segments but increase memory usage and latency. Lower values reduce latency but may affect naturalness at chunk boundaries.", "maximum": 300, "minimum": 100, "title": "Chunk Length", @@ -113,13 +113,13 @@ }, "normalize": { "default": true, - "description": "Whether to normalize the speech, this will reduce the latency but may reduce performance on numbers and dates", + "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency.", "title": "Normalize", "type": "boolean" }, "format": { "default": "mp3", - "description": "Format to be used for the speech", + "description": "Output audio format. 'mp3': Compressed, widely compatible, good for streaming (44.1kHz). 'wav': Uncompressed, highest quality, larger file size (44.1kHz). 
'pcm': Raw 16-bit audio data, useful for real-time processing (44.1kHz). 'opus': Modern compressed format, excellent quality at low bitrates, ideal for web applications (48kHz).", "enum": [ "wav", "pcm", @@ -139,12 +139,12 @@ } ], "default": null, - "description": "Sample rate to be used for the speech", + "description": "Audio sample rate in Hz. When null, uses the format's default (44100 Hz for most formats). Common values: 44100 (CD quality), 22050 (reduced size), 16000 (speech-optimized). Lower sample rates reduce file size but may affect audio quality.", "title": "Sample Rate" }, "mp3_bitrate": { "default": 128, - "description": "MP3 Bitrate to be used for the speech", + "description": "Bitrate for MP3 output in kbps. Only applies when format='mp3'. 64: Smaller files, reduced quality. 128: Good balance of quality and size (recommended). 192: Higher quality, larger files.", "enum": [ 64, 128, @@ -154,8 +154,8 @@ "type": "integer" }, "opus_bitrate": { - "default": 32, - "description": "Opus Bitrate to be used for the speech", + "default": -1000, + "description": "Bitrate for Opus output in kbps. Only applies when format='opus'. -1000: Automatic bitrate selection based on content (recommended). 24-64: Manual bitrate selection. Higher values improve quality but increase file size.", "enum": [ -1000, 24, @@ -168,13 +168,48 @@ }, "latency": { "default": "normal", - "description": "Latency to be used for the speech, balanced will reduce the latency but may lead to performance degradation", + "description": "Controls the trade-off between response time and audio quality. 'normal': Best quality, standard latency - recommended for non-real-time applications. 'balanced': Reduced time-to-first-byte at the cost of slightly reduced quality - recommended for real-time/streaming applications. 
'low': Lowest latency, may have further quality trade-offs.", "enum": [ + "low", "normal", "balanced" ], "title": "Latency", "type": "string" + }, + "max_new_tokens": { + "default": 1024, + "description": "Maximum number of audio tokens to generate per text chunk. Higher values allow generating longer audio segments per chunk. Reduce if experiencing memory issues or if you only need short utterances. Most users should leave this at the default.", + "title": "Max New Tokens", + "type": "integer" + }, + "repetition_penalty": { + "default": 1.2, + "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition. Higher values more aggressively prevent loops but may affect naturalness.", + "title": "Repetition Penalty", + "type": "number" + }, + "min_chunk_length": { + "default": 50, + "description": "Minimum number of characters required before splitting text into a new chunk. Prevents creation of very short audio segments that might sound unnatural. Higher values ensure longer continuous segments but may increase the delay before the first audio is returned. Works in conjunction with chunk_length.", + "minimum": 0, + "maximum": 100, + "title": "Min Chunk Length", + "type": "integer" + }, + "condition_on_previous_chunks": { + "default": true, + "description": "Enables voice consistency across multiple text chunks. When true, uses the audio from previous chunks as context for generating subsequent chunks, maintaining consistent voice characteristics throughout long texts. When false, each chunk is generated independently. Disable only if you want deliberately varied intonation between segments.", + "title": "Condition On Previous Chunks", + "type": "boolean" + }, + "early_stop_threshold": { + "default": 1.0, + "description": "Controls when to stop audio generation early during batch processing. A value of 1.0 waits for all samples to complete (best quality). 
Lower values (e.g., 0.8) stop when 80% of samples are done, which can reduce latency but may cut off audio prematurely. Most users should leave this at 1.0.", + "title": "Early Stop Threshold", + "type": "number", + "minimum": 0.0, + "maximum": 1.0 } }, "required": [ @@ -192,7 +227,7 @@ "type": "string" }, "temperature": { - "description": "Controls randomness in the speech generation. Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic", + "description": "Controls the randomness/expressiveness of the generated speech. Higher values produce more varied and expressive speech, lower values produce more consistent output.", "title": "Temperature", "type": "number", "default": 0.7, @@ -200,7 +235,7 @@ "maximum": 1.0 }, "top_p": { - "description": "Controls diversity via nucleus sampling. Lower values (e.g., 0.1) make the output more focused, while higher values (e.g., 1.0) allow more diversity", + "description": "Controls diversity via nucleus sampling. Lower values produce more focused/predictable speech, higher values allow more variety.", "title": "Top P", "type": "number", "default": 0.7, @@ -232,7 +267,7 @@ } ], "default": null, - "description": "ID of the reference model o be used for the speech", + "description": "The unique identifier of a voice model to use for speech synthesis. You can find model IDs in the Fish Audio voice library or use your own custom-trained models. 
If not specified and no references provided, a default voice will be used.", "title": "Reference Id" }, "prosody": { @@ -248,7 +283,7 @@ "description": "Prosody to be used for the speech" }, "chunk_length": { - "default": 200, + "default": 300, "description": "Chunk length to be used for the speech", "maximum": 300, "minimum": 100, @@ -257,7 +292,7 @@ }, "normalize": { "default": true, - "description": "Whether to normalize the speech, this will reduce the latency but may reduce performance on numbers and dates", + "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency.", "title": "Normalize", "type": "boolean" }, @@ -298,7 +333,7 @@ "type": "integer" }, "opus_bitrate": { - "default": 32, + "default": -1000, "description": "Opus Bitrate to be used for the speech", "enum": [ -1000, @@ -314,11 +349,46 @@ "default": "normal", "description": "Latency to be used for the speech, balanced will reduce the latency but may lead to performance degradation", "enum": [ + "low", "normal", "balanced" ], "title": "Latency", "type": "string" + }, + "max_new_tokens": { + "default": 1024, + "description": "Maximum number of tokens to generate per chunk. Each chunk can produce up to ~25 seconds of audio", + "title": "Max New Tokens", + "type": "integer" + }, + "repetition_penalty": { + "default": 1.2, + "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition. Higher values more aggressively prevent loops but may affect naturalness.", + "title": "Repetition Penalty", + "type": "number" + }, + "min_chunk_length": { + "default": 50, + "description": "Minimum chunk length for text splitting", + "minimum": 0, + "maximum": 100, + "title": "Min Chunk Length", + "type": "integer" + }, + "condition_on_previous_chunks": { + "default": true, + "description": "Whether to condition generation on previous chunks. 
If references are provided, uses references only; otherwise uses first chunk as reference", + "title": "Condition On Previous Chunks", + "type": "boolean" + }, + "early_stop_threshold": { + "default": 1.0, + "description": "Early stop if this fraction of samples are finished", + "title": "Early Stop Threshold", + "type": "number", + "minimum": 0.0, + "maximum": 1.0 } }, "required": [ @@ -682,6 +752,476 @@ ] } }, + "/v1/tts/with_timestamp": { + "post": { + "summary": "Text to Speech with Timestamp Alignment", + "description": "Generates speech from text and returns word-level timestamp alignment. The complete audio is generated first, then aligned with the text to provide precise timing information for each segment.", + "security": [ + { + "BearerAuth": [] + } + ], + "parameters": [ + { + "in": "header", + "name": "model", + "description": "Specify which TTS model to use. We recommend `s1`", + "required": true, + "schema": { + "type": "string", + "default": "s1", + "enum": [ + "s1", + "speech-1.6", + "speech-1.5" + ] + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "properties": { + "text": { + "description": "The text content to convert to speech.", + "title": "Text", + "type": "string" + }, + "temperature": { + "description": "Controls the randomness/expressiveness of the generated speech.", + "title": "Temperature", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "top_p": { + "description": "Controls diversity via nucleus sampling.", + "title": "Top P", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "reference_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The unique identifier of a voice model to use for speech synthesis.", + "title": "Reference Id" + }, + "chunk_length": { + "default": 300, + "description": "Controls the size of text segments for processing.", + "maximum": 
300, + "minimum": 100, + "title": "Chunk Length", + "type": "integer" + }, + "normalize": { + "default": true, + "description": "Normalizes text for English and Chinese, improving stability for numbers.", + "title": "Normalize", + "type": "boolean" + }, + "latency": { + "default": "normal", + "description": "Controls the trade-off between response time and audio quality.", + "enum": [ + "low", + "normal", + "balanced" + ], + "title": "Latency", + "type": "string" + } + }, + "required": [ + "text" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Request fulfilled, returns audio with timestamp alignment", + "headers": {}, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TTSWithTimestampResponse" + } + } + } + }, + "400": { + "description": "Bad request", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "401": { + "description": "No permission -- see authorization schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "402": { + "description": "No payment -- see charging schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "422": { + "description": "Validation error", + "headers": {}, + "content": { + "application/json": { + "schema": { + "type": "array", + 
"items": { + "type": "object", + "properties": { + "loc": { + "title": "Location", + "description": "error field", + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "title": "Type", + "description": "error type", + "type": "string" + }, + "msg": { + "title": "Message", + "description": "error message", + "type": "string" + } + }, + "required": [ + "loc", + "type", + "msg" + ] + } + } + } + } + } + }, + "tags": [ + "OpenAPI v1" + ] + } + }, + "/v1/tts/with_timestamp/stream": { + "post": { + "summary": "Streaming Text to Speech with Timestamp Alignment", + "description": "Streams speech chunks with progressive timestamp alignment via Server-Sent Events (SSE). Each event contains a JSON object with `audio_base64`, `text`, and `alignment` fields. The SSE format is: `data: {\"audio_base64\": \"...\", \"text\": \"...\", \"alignment\": [...]}\\n\\n`", + "security": [ + { + "BearerAuth": [] + } + ], + "parameters": [ + { + "in": "header", + "name": "model", + "description": "Specify which TTS model to use. 
Only s1 supports streaming alignment.", + "required": true, + "schema": { + "type": "string", + "default": "s1", + "enum": [ + "s1" + ] + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "properties": { + "text": { + "description": "The text content to convert to speech.", + "title": "Text", + "type": "string" + }, + "temperature": { + "description": "Controls the randomness/expressiveness of the generated speech.", + "title": "Temperature", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "top_p": { + "description": "Controls diversity via nucleus sampling.", + "title": "Top P", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "reference_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The unique identifier of a voice model to use for speech synthesis.", + "title": "Reference Id" + }, + "chunk_length": { + "default": 300, + "description": "Controls the size of text segments for processing.", + "maximum": 300, + "minimum": 100, + "title": "Chunk Length", + "type": "integer" + }, + "normalize": { + "default": true, + "description": "Normalizes text for English and Chinese, improving stability for numbers.", + "title": "Normalize", + "type": "boolean" + }, + "latency": { + "default": "normal", + "description": "Controls the trade-off between response time and audio quality.", + "enum": [ + "low", + "normal", + "balanced" + ], + "title": "Latency", + "type": "string" + } + }, + "required": [ + "text" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Request fulfilled, returns SSE stream of audio chunks with alignment", + "headers": { + "Content-Type": { + "schema": { + "type": "string" + }, + "description": "text/event-stream" + } + }, + "content": { + "text/event-stream": { + "schema": { + "type": "string", + "description": "Server-Sent Events 
stream. Each event data field contains a JSON object with: audio_base64 (string), text (string, optional), alignment (array of TimestampSegment, optional)" + } + } + } + }, + "400": { + "description": "Bad request - model not supported for streaming alignment", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "401": { + "description": "No permission -- see authorization schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "402": { + "description": "No payment -- see charging schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "422": { + "description": "Validation error", + "headers": {}, + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "loc": { + "title": "Location", + "description": "error field", + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "title": "Type", + "description": "error type", + "type": "string" + }, + "msg": { + "title": "Message", + "description": "error message", + "type": "string" + } + }, + "required": [ + "loc", + "type", + "msg" + ] + } + } + } + } + } + }, + "tags": [ + "OpenAPI v1" + ] + } + }, "/wallet/{user_id}/package": { "get": { "summary": "Get User Package", @@ -2606,14 +3146,17 @@ }, "schemas": 
{ "ProsodyControl": { + "description": "Controls for adjusting the prosody (rhythm and intonation) of generated speech.", "properties": { "speed": { "default": 1, + "description": "Speaking rate multiplier. Valid range: 0.5 to 2.0. 1.0 = normal speed, 0.5 = half speed, 2.0 = double speed. Useful for adjusting pacing without regenerating audio.", "title": "Speed", "type": "number" }, "volume": { "default": 0, + "description": "Volume adjustment in decibels (dB). 0 = no change, positive values = louder, negative values = quieter.", "title": "Volume", "type": "number" } @@ -2622,13 +3165,16 @@ "type": "object" }, "ReferenceAudio": { + "description": "A voice sample with its transcript, used for zero-shot voice cloning. The model will attempt to match the voice characteristics from the audio sample.", "properties": { "audio": { "format": "binary", + "description": "Raw audio bytes of the voice sample. Supported formats: WAV, MP3, FLAC. For best results, use 10-30 seconds of clear speech with minimal background noise.", "title": "Audio", "type": "string" }, "text": { + "description": "The exact transcript of what is spoken in the audio sample. 
Accuracy is important for voice cloning quality.", "title": "Text", "type": "string" } @@ -2860,6 +3406,75 @@ ], "title": "SampleEntity", "type": "object" + }, + "TimestampSegment": { + "description": "A segment of aligned text with its timing information", + "properties": { + "text": { + "description": "The text content of this segment", + "title": "Text", + "type": "string" + }, + "start": { + "description": "Start time in seconds", + "title": "Start", + "type": "number" + }, + "end": { + "description": "End time in seconds", + "title": "End", + "type": "number" + } + }, + "required": [ + "text", + "start", + "end" + ], + "title": "TimestampSegment", + "type": "object" + }, + "TTSWithTimestampResponse": { + "description": "TTS response with audio and timestamp alignment", + "properties": { + "audio_base64": { + "description": "Base64-encoded audio data", + "title": "Audio Base64", + "type": "string" + }, + "text": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The text that was synthesized (with emotion markers removed)", + "title": "Text" + }, + "alignment": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/TimestampSegment" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "description": "Word-level timestamp alignment segments", + "title": "Alignment" + } + }, + "required": [ + "audio_base64" + ], + "title": "TTSWithTimestampResponse", + "type": "object" } } } diff --git a/docs.json b/docs.json index a27854b..b8b617f 100644 --- a/docs.json +++ b/docs.json @@ -167,6 +167,8 @@ "icon": "code", "pages": [ "api-reference/endpoint/openapi-v1/text-to-speech", + "api-reference/endpoint/openapi-v1/tts-with-timestamp", + "api-reference/endpoint/openapi-v1/tts-with-timestamp-stream", "api-reference/endpoint/openapi-v1/speech-to-text", "api-reference/endpoint/websocket/tts-live" ]