From fec9901d134bf28c990f762df6501e7ee01d26e0 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 4 Dec 2025 22:28:08 -0600 Subject: [PATCH 1/3] fix: improve docs for openapi.json, expose new params for tts --- api-reference/openapi.json | 102 ++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/api-reference/openapi.json b/api-reference/openapi.json index b9eeac7..b404b66 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -43,12 +43,12 @@ "schema": { "properties": { "text": { - "description": "Text to be converted to speech", + "description": "The text content to convert to speech. Supports multiple languages including English, Chinese, Japanese, and more. For best results, use proper punctuation and avoid mixing too many languages in a single request.", "title": "Text", "type": "string" }, "temperature": { - "description": "Controls randomness in the speech generation. Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic. We recommend `0.9` for `s1` model", + "description": "Controls the randomness/expressiveness of the generated speech. Higher values (e.g., 0.9) produce more varied and expressive speech with natural prosody variations. Lower values (e.g., 0.5) produce more consistent but potentially monotonous output. Recommended: 0.9 for s1 model.", "title": "Temperature", "type": "number", "default": 0.9, @@ -56,7 +56,7 @@ "maximum": 1.0 }, "top_p": { - "description": "Controls diversity via nucleus sampling. Lower values (e.g., 0.1) make the output more focused, while higher values (e.g., 1.0) allow more diversity. We recommend `0.9` for `s1` model", + "description": "Controls diversity via nucleus sampling. Limits token selection to the smallest set whose cumulative probability exceeds this value. Lower values (e.g., 0.5) produce more focused/predictable speech, higher values (e.g., 0.95) allow more variety. Recommended: 0.9 for s1 model.", "title": "Top P", "type": "number", "default": 0.9, @@ -75,7 +75,7 @@ "type": "null" } ], - "description": "References to be used for the speech, this requires MessagePack serialization, this will override reference_voices and reference_texts", + "description": "Custom voice references for zero-shot voice cloning. Provide audio samples with their transcripts to clone a voice on-the-fly without creating a model. Requires MessagePack serialization (not available via JSON). Takes precedence over reference_id if both are provided. For best results, use 10-30 seconds of clean audio.", "title": "References" }, "reference_id": { @@ -88,7 +88,7 @@ } ], "default": null, - "description": "ID of the reference model o be used for the speech", + "description": "The unique identifier of a voice model to use for speech synthesis. You can find model IDs in the Fish Audio voice library or use your own custom-trained models. If not specified and no references provided, a default voice will be used.", "title": "Reference Id" }, "prosody": { @@ -101,11 +101,11 @@ } ], "default": null, - "description": "Prosody to be used for the speech" + "description": "Fine-tune the speech output with speed and volume adjustments. Speed controls speaking rate (1.0 = normal, <1.0 = slower, >1.0 = faster). Volume adjusts loudness in decibels (0 = unchanged, positive = louder, negative = quieter)." 
}, "chunk_length": { "default": 200, - "description": "Chunk length to be used for the speech", + "description": "Controls the size of text segments for processing. The text is split into chunks before synthesis. Higher values (e.g., 300) produce longer continuous speech segments but increase memory usage and latency. Lower values (e.g., 100) reduce latency but may affect naturalness at chunk boundaries. Recommended: 200 for balanced quality/latency.", "maximum": 300, "minimum": 100, "title": "Chunk Length", @@ -113,13 +113,13 @@ }, "normalize": { "default": true, - "description": "Whether to normalize the speech, this will reduce the latency but may reduce performance on numbers and dates", + "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency. Disable if you need precise control over how text is spoken.", "title": "Normalize", "type": "boolean" }, "format": { "default": "mp3", - "description": "Format to be used for the speech", + "description": "Output audio format. 'mp3': Compressed, widely compatible, good for streaming (44.1kHz). 'wav': Uncompressed, highest quality, larger file size (44.1kHz). 'pcm': Raw 16-bit audio data, useful for real-time processing (44.1kHz). 'opus': Modern compressed format, excellent quality at low bitrates, ideal for web applications (48kHz).", "enum": [ "wav", "pcm", @@ -139,12 +139,12 @@ } ], "default": null, - "description": "Sample rate to be used for the speech", + "description": "Audio sample rate in Hz. When null, uses the format's default (44100 Hz for most formats). Common values: 44100 (CD quality), 22050 (reduced size), 16000 (speech-optimized). Lower sample rates reduce file size but may affect audio quality.", "title": "Sample Rate" }, "mp3_bitrate": { "default": 128, - "description": "MP3 Bitrate to be used for the speech", + "description": "Bitrate for MP3 output in kbps. Only applies when format='mp3'. 64: Smaller files, reduced quality. 128: Good balance of quality and size (recommended). 192: Higher quality, larger files.", "enum": [ 64, 128, @@ -155,7 +155,7 @@ }, "opus_bitrate": { "default": 32, - "description": "Opus Bitrate to be used for the speech", + "description": "Bitrate for Opus output in kbps. Only applies when format='opus'. -1000: Automatic bitrate selection based on content. 24-64: Manual bitrate selection. Higher values improve quality but increase file size. Opus is very efficient, so even 32kbps provides good speech quality.", "enum": [ -1000, 24, @@ -168,13 +168,47 @@ }, "latency": { "default": "normal", - "description": "Latency to be used for the speech, balanced will reduce the latency but may lead to performance degradation", + "description": "Controls the trade-off between response time and audio quality. 'normal': Best quality, standard latency - recommended for non-real-time applications. 'balanced': Reduced time-to-first-byte at the cost of slightly reduced quality - recommended for real-time/streaming applications where responsiveness matters.", "enum": [ "normal", "balanced" ], "title": "Latency", "type": "string" + }, + "max_new_tokens": { + "default": 1024, + "description": "Maximum number of audio tokens to generate per text chunk. Higher values allow generating longer audio segments per chunk. Reduce if experiencing memory issues or if you only need short utterances. 
Most users should leave this at the default.", + "title": "Max New Tokens", + "type": "integer" + }, + "repetition_penalty": { + "default": 1.2, + "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition (1.2 is recommended). Higher values (e.g., 1.5) more aggressively prevent loops but may affect naturalness. Values closer to 1.0 allow more repetition. Useful for preventing audio artifacts in longer generations.", + "title": "Repetition Penalty", + "type": "number" + }, + "min_chunk_length": { + "default": 50, + "description": "Minimum number of characters required before splitting text into a new chunk. Prevents creation of very short audio segments that might sound unnatural. Higher values ensure longer continuous segments but may increase latency for the first audio. Works in conjunction with chunk_length.", + "minimum": 0, + "maximum": 100, + "title": "Min Chunk Length", + "type": "integer" + }, + "condition_on_previous_chunks": { + "default": true, + "description": "Enables voice consistency across multiple text chunks. When true, uses the audio from previous chunks as context for generating subsequent chunks, maintaining consistent voice characteristics throughout long texts. When false, each chunk is generated independently. Disable only if you want deliberately varied intonation between segments.", + "title": "Condition On Previous Chunks", + "type": "boolean" + }, + "early_stop_threshold": { + "default": 1.0, + "description": "Controls when to stop audio generation early during batch processing. Value of 1.0 means wait for all samples to complete (best quality). Lower values (e.g., 0.8) stop when 80% of samples are done, potentially reducing latency but may cut off audio prematurely. Most users should leave this at 1.0.", + "title": "Early Stop Threshold", + "type": "number", + "minimum": 0.0, + "maximum": 1.0 } }, "required": [ @@ -232,7 +266,7 @@ } ], "default": null, - "description": "ID of the reference model o be used for the speech", + "description": "The unique identifier of a voice model to use for speech synthesis. You can find model IDs in the Fish Audio voice library or use your own custom-trained models. If not specified and no references provided, a default voice will be used.", "title": "Reference Id" }, "prosody": { @@ -319,6 +353,40 @@ ], "title": "Latency", "type": "string" + }, + "max_new_tokens": { + "default": 1024, + "description": "Maximum number of tokens to generate per chunk. Each chunk can produce up to ~25 seconds of audio", + "title": "Max New Tokens", + "type": "integer" + }, + "repetition_penalty": { + "default": 1.2, + "description": "Penalty for repeating tokens. Higher values reduce repetition in the generated speech", + "title": "Repetition Penalty", + "type": "number" + }, + "min_chunk_length": { + "default": 50, + "description": "Minimum chunk length for text splitting", + "minimum": 0, + "maximum": 100, + "title": "Min Chunk Length", + "type": "integer" + }, + "condition_on_previous_chunks": { + "default": true, + "description": "Whether to condition generation on previous chunks. 
If references are provided, uses references only; otherwise uses first chunk as reference", "title": "Condition On Previous Chunks", "type": "boolean" + }, + "early_stop_threshold": { + "default": 1.0, + "description": "Stop generation early once this fraction of samples has finished", + "title": "Early Stop Threshold", + "type": "number", + "minimum": 0.0, + "maximum": 1.0 } }, "required": [ @@ -2606,14 +2674,17 @@ }, "schemas": { "ProsodyControl": { + "description": "Controls for adjusting the prosody (rhythm and intonation) of generated speech.", "properties": { "speed": { "default": 1, + "description": "Speaking rate multiplier. Valid range: 0.5 to 2.0. 1.0 = normal speed, 0.5 = half speed, 2.0 = double speed. Useful for adjusting pacing.", "title": "Speed", "type": "number" }, "volume": { "default": 0, + "description": "Volume adjustment in decibels (dB). 0 = no change, positive values = louder, negative values = quieter.", "title": "Volume", "type": "number" } @@ -2622,13 +2693,16 @@ "type": "object" }, "ReferenceAudio": { + "description": "A voice sample with its transcript, used for zero-shot voice cloning. The model will attempt to match the voice characteristics from the audio sample.", "properties": { "audio": { "format": "binary", + "description": "Raw audio bytes of the voice sample. Supported formats: WAV, MP3, FLAC. For best results, use 10-30 seconds of clear speech with minimal background noise.", "title": "Audio", "type": "string" }, "text": { + "description": "The exact transcript of what is spoken in the audio sample. Accuracy is important for voice cloning quality.", "title": "Text", "type": "string" } From 4f35a1dc6dada9e3aca55aeddc3bd1b25c98c3e0 Mon Sep 17 00:00:00 2001 From: James Ding Date: Tue, 9 Dec 2025 20:09:21 -0600 Subject: [PATCH 2/3] fix: update descriptions and defaults in openapi.json for speech parameters --- api-reference/openapi.json | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/api-reference/openapi.json b/api-reference/openapi.json index b404b66..73c6dbf 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -48,18 +48,18 @@ "type": "string" }, "temperature": { - "description": "Controls the randomness/expressiveness of the generated speech. Higher values (e.g., 0.9) produce more varied and expressive speech with natural prosody variations. Lower values (e.g., 0.5) produce more consistent but potentially monotonous output. Recommended: 0.9 for s1 model.", + "description": "Controls the randomness/expressiveness of the generated speech. Higher values produce more varied and expressive speech, lower values produce more consistent output.", "title": "Temperature", "type": "number", - "default": 0.9, + "default": 0.7, "minimum": 0.0, "maximum": 1.0 }, "top_p": { - "description": "Controls diversity via nucleus sampling. Limits token selection to the smallest set whose cumulative probability exceeds this value. Lower values (e.g., 0.5) produce more focused/predictable speech, higher values (e.g., 0.95) allow more variety. Recommended: 0.9 for s1 model.", + "description": "Controls diversity via nucleus sampling. Lower values produce more focused/predictable speech, higher values allow more variety.", "title": "Top P", "type": "number", - "default": 0.9, + "default": 0.7, "minimum": 0.0, "maximum": 1.0 }, @@ -104,8 +104,8 @@ "description": "Fine-tune the speech output with speed and volume adjustments. 
Speed controls speaking rate (1.0 = normal, <1.0 = slower, >1.0 = faster). Volume adjusts loudness in decibels (0 = unchanged, positive = louder, negative = quieter)." }, "chunk_length": { - "default": 200, - "description": "Controls the size of text segments for processing. The text is split into chunks before synthesis. Higher values (e.g., 300) produce longer continuous speech segments but increase memory usage and latency. Lower values (e.g., 100) reduce latency but may affect naturalness at chunk boundaries. Recommended: 200 for balanced quality/latency.", + "default": 300, + "description": "Controls the size of text segments for processing. The text is split into chunks before synthesis. Higher values produce longer continuous speech segments but increase memory usage and latency. Lower values reduce latency but may affect naturalness at chunk boundaries.", "maximum": 300, "minimum": 100, "title": "Chunk Length", @@ -113,7 +113,7 @@ }, "normalize": { "default": true, - "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency. Disable if you need precise control over how text is spoken.", + "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency.", "title": "Normalize", "type": "boolean" }, @@ -154,8 +154,8 @@ "type": "integer" }, "opus_bitrate": { - "default": 32, - "description": "Bitrate for Opus output in kbps. Only applies when format='opus'. -1000: Automatic bitrate selection based on content. 24-64: Manual bitrate selection. Higher values improve quality but increase file size. Opus is very efficient, so even 32kbps provides good speech quality.", + "default": -1000, + "description": "Bitrate for Opus output in kbps. Only applies when format='opus'. -1000: Automatic bitrate selection based on content (recommended). 24-64: Manual bitrate selection. Higher values improve quality but increase file size.", "enum": [ -1000, 24, @@ -168,8 +168,9 @@ }, "latency": { "default": "normal", - "description": "Controls the trade-off between response time and audio quality. 'normal': Best quality, standard latency - recommended for non-real-time applications. 'balanced': Reduced time-to-first-byte at the cost of slightly reduced quality - recommended for real-time/streaming applications where responsiveness matters.", + "description": "Controls the trade-off between response time and audio quality. 'normal': Best quality, standard latency - recommended for non-real-time applications. 'balanced': Reduced time-to-first-byte at the cost of slightly reduced quality - recommended for real-time/streaming applications. 'low': Lowest latency, may have further quality trade-offs.", "enum": [ + "low", "normal", "balanced" ], @@ -184,7 +185,7 @@ }, "repetition_penalty": { "default": 1.2, - "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition (1.2 is recommended). Higher values (e.g., 1.5) more aggressively prevent loops but may affect naturalness. Values closer to 1.0 allow more repetition. Useful for preventing audio artifacts in longer generations.", + "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition. 
Higher values more aggressively prevent loops but may affect naturalness.", "title": "Repetition Penalty", "type": "number" }, @@ -226,7 +227,7 @@ "type": "string" }, "temperature": { - "description": "Controls randomness in the speech generation. Higher values (e.g., 1.0) make the output more random, while lower values (e.g., 0.1) make it more deterministic", + "description": "Controls the randomness/expressiveness of the generated speech. Higher values produce more varied and expressive speech, lower values produce more consistent output.", "title": "Temperature", "type": "number", "default": 0.7, @@ -234,7 +235,7 @@ "maximum": 1.0 }, "top_p": { - "description": "Controls diversity via nucleus sampling. Lower values (e.g., 0.1) make the output more focused, while higher values (e.g., 1.0) allow more diversity", + "description": "Controls diversity via nucleus sampling. Lower values produce more focused/predictable speech, higher values allow more variety.", "title": "Top P", "type": "number", "default": 0.7, @@ -282,7 +283,7 @@ "description": "Prosody to be used for the speech" }, "chunk_length": { - "default": 200, + "default": 300, "description": "Chunk length to be used for the speech", "maximum": 300, "minimum": 100, @@ -291,7 +292,7 @@ }, "normalize": { "default": true, - "description": "Whether to normalize the speech, this will reduce the latency but may reduce performance on numbers and dates", + "description": "Normalizes text for English and Chinese, improving stability for numbers. When enabled, preprocesses text to improve pronunciation consistency.", "title": "Normalize", "type": "boolean" }, @@ -332,7 +333,7 @@ "type": "integer" }, "opus_bitrate": { - "default": 32, + "default": -1000, "description": "Opus Bitrate to be used for the speech", "enum": [ -1000, @@ -348,6 +349,7 @@ "default": "normal", "description": "Latency to be used for the speech, balanced will reduce the latency but may lead to performance degradation", "enum": [ + "low", "normal", "balanced" ], @@ -362,7 +364,7 @@ }, "repetition_penalty": { "default": 1.2, - "description": "Penalty for repeating tokens. Higher values reduce repetition in the generated speech", + "description": "Discourages the model from repeating similar audio patterns. Values > 1.0 reduce repetition. Higher values more aggressively prevent loops but may affect naturalness.", "title": "Repetition Penalty", "type": "number" }, From e04eb6b58a31abc14510c25d4d821ca19b6e1a54 Mon Sep 17 00:00:00 2001 From: James Ding Date: Fri, 19 Dec 2025 14:41:28 -0800 Subject: [PATCH 3/3] feat: add Text to Speech endpoints with timestamp alignment and documentation --- .../openapi-v1/tts-with-timestamp.mdx | 49 ++ api-reference/openapi.json | 539 ++++++++++++++++++ docs.json | 2 + 3 files changed, 590 insertions(+) create mode 100644 api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx diff --git a/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx b/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx new file mode 100644 index 0000000..a02eea9 --- /dev/null +++ b/api-reference/endpoint/openapi-v1/tts-with-timestamp.mdx @@ -0,0 +1,49 @@ +--- +openapi: post /v1/tts/with_timestamp +title: 'Text to Speech with Timestamp' +description: 'Generate speech with word-level timestamp alignment' +icon: "clock" +iconType: "solid" +--- + + +This endpoint generates complete audio first, then aligns it with the input text to provide precise timing information for each segment. The response includes both the audio and an array of timestamp segments. 
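+
+## Example Usage
+
+The sketch below is one way to call this endpoint and turn the returned alignment into an SRT subtitle file (see the Use Cases section). It is a minimal illustration, not an official client: the endpoint path, `model` header, Bearer authentication, and response fields follow the schema in this reference, while the base URL, API key placeholder, output file names, and the use of Python's `requests` library are assumptions to adapt to your setup.
+
+```python
+import base64
+
+import requests  # assumption: third-party HTTP client, `pip install requests`
+
+API_KEY = "YOUR_API_KEY"             # placeholder: substitute a real key
+BASE_URL = "https://api.fish.audio"  # assumption: adjust to your environment
+
+resp = requests.post(
+    f"{BASE_URL}/v1/tts/with_timestamp",
+    headers={
+        "Authorization": f"Bearer {API_KEY}",
+        "model": "s1",  # per this spec, the TTS model is selected via a header
+    },
+    json={"text": "Hello, world!"},
+)
+resp.raise_for_status()
+data = resp.json()
+
+# Decode and save the audio (the example response below shows RIFF/WAV bytes).
+with open("output.wav", "wb") as f:
+    f.write(base64.b64decode(data["audio_base64"]))
+
+
+def to_srt_time(seconds: float) -> str:
+    """Format seconds as an SRT timestamp, HH:MM:SS,mmm."""
+    ms = int(round(seconds * 1000))
+    hours, ms = divmod(ms, 3_600_000)
+    minutes, ms = divmod(ms, 60_000)
+    secs, ms = divmod(ms, 1_000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
+
+
+# Write each aligned segment as a numbered SRT cue.
+with open("output.srt", "w", encoding="utf-8") as srt:
+    for i, seg in enumerate(data.get("alignment") or [], start=1):
+        srt.write(f"{i}\n")
+        srt.write(f"{to_srt_time(seg['start'])} --> {to_srt_time(seg['end'])}\n")
+        srt.write(f"{seg['text']}\n\n")
+```
+
+Each cue maps one alignment segment to a subtitle; for karaoke-style highlighting you can consume the same segments directly instead of writing a file.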
+ + +## Response Format + +The response is a JSON object containing: + +| Field | Type | Description | +|-------|------|-------------| +| `audio_base64` | string | Base64-encoded audio data | +| `text` | string | The synthesized text (with emotion markers removed) | +| `alignment` | array | Array of timestamp segments | + +Each timestamp segment contains: + +| Field | Type | Description | +|-------|------|-------------| +| `text` | string | The text content of this segment | +| `start` | number | Start time in seconds | +| `end` | number | End time in seconds | + +## Example Response + +```json +{ + "audio_base64": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8...", + "text": "Hello, world!", + "alignment": [ + {"text": "Hello,", "start": 0.0, "end": 0.45}, + {"text": "world!", "start": 0.52, "end": 1.1} + ] +} +``` + +## Use Cases + +- **Subtitle generation**: Automatically create synchronized subtitles for video content +- **Karaoke-style highlighting**: Highlight words as they are spoken +- **Accessibility features**: Provide visual indicators synchronized with audio playback +- **Audio editing**: Precisely locate and edit specific words in generated speech \ No newline at end of file diff --git a/api-reference/openapi.json b/api-reference/openapi.json index 73c6dbf..64a29fe 100644 --- a/api-reference/openapi.json +++ b/api-reference/openapi.json @@ -752,6 +752,476 @@ ] } }, + "/v1/tts/with_timestamp": { + "post": { + "summary": "Text to Speech with Timestamp Alignment", + "description": "Generates speech from text and returns word-level timestamp alignment. The complete audio is generated first, then aligned with the text to provide precise timing information for each segment.", + "security": [ + { + "BearerAuth": [] + } + ], + "parameters": [ + { + "in": "header", + "name": "model", + "description": "Specify which TTS model to use. 
We recommend `s1`", + "required": true, + "schema": { + "type": "string", + "default": "s1", + "enum": [ + "s1", + "speech-1.6", + "speech-1.5" + ] + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "properties": { + "text": { + "description": "The text content to convert to speech.", + "title": "Text", + "type": "string" + }, + "temperature": { + "description": "Controls the randomness/expressiveness of the generated speech.", + "title": "Temperature", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "top_p": { + "description": "Controls diversity via nucleus sampling.", + "title": "Top P", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "reference_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The unique identifier of a voice model to use for speech synthesis.", + "title": "Reference Id" + }, + "chunk_length": { + "default": 300, + "description": "Controls the size of text segments for processing.", + "maximum": 300, + "minimum": 100, + "title": "Chunk Length", + "type": "integer" + }, + "normalize": { + "default": true, + "description": "Normalizes text for English and Chinese, improving stability for numbers.", + "title": "Normalize", + "type": "boolean" + }, + "latency": { + "default": "normal", + "description": "Controls the trade-off between response time and audio quality.", + "enum": [ + "low", + "normal", + "balanced" + ], + "title": "Latency", + "type": "string" + } + }, + "required": [ + "text" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Request fulfilled, returns audio with timestamp alignment", + "headers": {}, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TTSWithTimestampResponse" + } + } + } + }, + "400": { + "description": "Bad request", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "401": { + "description": "No permission -- see authorization schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "402": { + "description": "No payment -- see charging schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "422": { + "description": "Validation error", + "headers": {}, + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "loc": { + "title": "Location", + "description": "error field", + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "title": "Type", + "description": "error type", + "type": "string" + }, + "msg": { + "title": "Message", + "description": "error message", + "type": "string" + } + }, + "required": [ + "loc", + "type", + "msg" + ] + } + } + } + } + } + }, + "tags": [ + 
"OpenAPI v1" + ] + } + }, + "/v1/tts/with_timestamp/stream": { + "post": { + "summary": "Streaming Text to Speech with Timestamp Alignment", + "description": "Streams speech chunks with progressive timestamp alignment via Server-Sent Events (SSE). Each event contains a JSON object with `audio_base64`, `text`, and `alignment` fields. The SSE format is: `data: {\"audio_base64\": \"...\", \"text\": \"...\", \"alignment\": [...]}\\n\\n`", + "security": [ + { + "BearerAuth": [] + } + ], + "parameters": [ + { + "in": "header", + "name": "model", + "description": "Specify which TTS model to use. Only s1 supports streaming alignment.", + "required": true, + "schema": { + "type": "string", + "default": "s1", + "enum": [ + "s1" + ] + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "properties": { + "text": { + "description": "The text content to convert to speech.", + "title": "Text", + "type": "string" + }, + "temperature": { + "description": "Controls the randomness/expressiveness of the generated speech.", + "title": "Temperature", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "top_p": { + "description": "Controls diversity via nucleus sampling.", + "title": "Top P", + "type": "number", + "default": 0.7, + "minimum": 0.0, + "maximum": 1.0 + }, + "reference_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "The unique identifier of a voice model to use for speech synthesis.", + "title": "Reference Id" + }, + "chunk_length": { + "default": 300, + "description": "Controls the size of text segments for processing.", + "maximum": 300, + "minimum": 100, + "title": "Chunk Length", + "type": "integer" + }, + "normalize": { + "default": true, + "description": "Normalizes text for English and Chinese, improving stability for numbers.", + "title": "Normalize", + "type": "boolean" + }, + "latency": { + "default": "normal", + "description": "Controls the trade-off between response time and audio quality.", + "enum": [ + "low", + "normal", + "balanced" + ], + "title": "Latency", + "type": "string" + } + }, + "required": [ + "text" + ], + "type": "object" + } + } + } + }, + "responses": { + "200": { + "description": "Request fulfilled, returns SSE stream of audio chunks with alignment", + "headers": { + "Content-Type": { + "schema": { + "type": "string" + }, + "description": "text/event-stream" + } + }, + "content": { + "text/event-stream": { + "schema": { + "type": "string", + "description": "Server-Sent Events stream. 
Each event data field contains a JSON object with: audio_base64 (string), text (string, optional), alignment (array of TimestampSegment, optional)" + } + } + } + }, + "400": { + "description": "Bad request - model not supported for streaming alignment", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "401": { + "description": "No permission -- see authorization schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "402": { + "description": "No payment -- see charging schemes", + "headers": {}, + "content": { + "application/json": { + "schema": { + "properties": { + "status": { + "title": "Status", + "type": "integer" + }, + "message": { + "title": "Message", + "type": "string" + } + }, + "required": [ + "status", + "message" + ], + "type": "object" + } + } + } + }, + "422": { + "description": "Validation error", + "headers": {}, + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "loc": { + "title": "Location", + "description": "error field", + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "title": "Type", + "description": "error type", + "type": "string" + }, + "msg": { + "title": "Message", + "description": "error message", + "type": "string" + } + }, + "required": [ + "loc", + "type", + "msg" + ] + } + } + } + } + } + }, + "tags": [ + "OpenAPI v1" + ] + } + }, "/wallet/{user_id}/package": { "get": { "summary": "Get User Package", @@ -2936,6 +3406,75 @@ ], "title": "SampleEntity", "type": "object" + }, + "TimestampSegment": { + "description": "A segment of aligned text with its timing information", + "properties": { + "text": { + "description": "The text content of this segment", + "title": "Text", + "type": "string" + }, + "start": { + "description": "Start time in seconds", + "title": "Start", + "type": "number" + }, + "end": { + "description": "End time in seconds", + "title": "End", + "type": "number" + } + }, + "required": [ + "text", + "start", + "end" + ], + "title": "TimestampSegment", + "type": "object" + }, + "TTSWithTimestampResponse": { + "description": "TTS response with audio and timestamp alignment", + "properties": { + "audio_base64": { + "description": "Base64-encoded audio data", + "title": "Audio Base64", + "type": "string" + }, + "text": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "description": "The text that was synthesized (with emotion markers removed)", + "title": "Text" + }, + "alignment": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/TimestampSegment" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "description": "Word-level timestamp alignment segments", + "title": "Alignment" + } + }, + "required": [ + "audio_base64" + ], + "title": "TTSWithTimestampResponse", + "type": "object" } } } diff --git a/docs.json b/docs.json index a27854b..b8b617f 100644 --- a/docs.json +++ b/docs.json @@ -167,6 +167,8 @@ "icon": "code", "pages": [ "api-reference/endpoint/openapi-v1/text-to-speech", + 
"api-reference/endpoint/openapi-v1/tts-with-timestamp", + "api-reference/endpoint/openapi-v1/tts-with-timestamp-stream", "api-reference/endpoint/openapi-v1/speech-to-text", "api-reference/endpoint/websocket/tts-live" ]