From e4a5e7e77f1a56ac3e2054cd0abfa5223714a830 Mon Sep 17 00:00:00 2001 From: Matthew Willett-Jeffries Date: Thu, 1 May 2025 17:23:57 -0400 Subject: [PATCH 1/9] adds simple api endpoint for listing available voices --- examples/server/server.cpp | 75 ++++++++++++++++++++++++++++++-------- include/common.h | 10 +++++ include/tts.h | 1 + src/kokoro_model.cpp | 9 +++++ src/kokoro_model.h | 3 +- src/tts.cpp | 9 +++++ 6 files changed, 91 insertions(+), 16 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 92aae79..206f488 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -54,6 +54,7 @@ enum error_type { enum task_type { TTS, CONDITIONAL_PROMPT, + VOICES, }; using json = nlohmann::ordered_json; @@ -96,8 +97,8 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp fprintf(stdout, "request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); } -struct simple_text_prompt_task { - simple_text_prompt_task(task_type task, std::string prompt): task(task), prompt(prompt) { +struct simple_server_task { + simple_server_task(task_type task, std::string prompt = ""): task(task), prompt(prompt) { id = rand(); time = std::chrono::steady_clock::now(); } @@ -124,11 +125,11 @@ struct simple_text_prompt_task { struct simple_task_queue { std::mutex rw_mutex; std::condition_variable condition; - std::deque queue; + std::deque queue; bool running = true; - struct simple_text_prompt_task * get_next() { - struct simple_text_prompt_task * resp; + struct simple_server_task * get_next() { + struct simple_server_task * resp; std::unique_lock lock(rw_mutex); condition.wait(lock, [&]{ return !queue.empty() || !running; @@ -148,7 +149,7 @@ struct simple_task_queue { condition.notify_all(); } - void push(struct simple_text_prompt_task * task) { + void push(struct simple_server_task * task) { std::lock_guard lock(rw_mutex); queue.push_back(task); condition.notify_one(); @@ 
-162,7 +163,7 @@ struct simple_response_map { std::atomic running = true; std::thread * cleanup_thread; - std::map completed; + std::map completed; void cleanup_routine() { std::unique_lock lock(rw_mutex); @@ -192,16 +193,16 @@ struct simple_response_map { updated.notify_all(); } - void push(struct simple_text_prompt_task * task) { + void push(struct simple_server_task * task) { std::unique_lock lock(rw_mutex); completed[task->id] = task; lock.unlock(); updated.notify_all(); } - struct simple_text_prompt_task * get(int id) { + struct simple_server_task * get(int id) { std::unique_lock lock(rw_mutex); - struct simple_text_prompt_task * resp = nullptr; + struct simple_server_task * resp = nullptr; try { return completed.at(id); } catch (const std::out_of_range& e) { @@ -243,14 +244,14 @@ struct worker { void loop() { while (running) { - struct simple_text_prompt_task * task = task_queue->get_next(); + struct simple_server_task * task = task_queue->get_next(); if (task) { process_task(task); } } } - void process_task(struct simple_text_prompt_task * task) { + void process_task(struct simple_server_task * task) { if (task->timed_out(task_timeout)) { return; } @@ -277,6 +278,21 @@ struct worker { task->success = true; response_map->push(task); break; + case VOICES: + if (!runner->supports_voices) { + task->message = "Voices are not supported for architecture '" + runner->arch_name() + "'."; + response_map->push(task); + break; + } + for (auto voice : list_voices(runner)) { + if (!task->message.empty()) { + task->message += ","; + } + task->message += voice; + } + task->success = true; + response_map->push(task); + break; } } }; @@ -518,6 +534,15 @@ int main(int argc, const char ** argv) { res.status = 200; }; + auto res_ok_voices = [](httplib::Response & res, const std::vector & voices) { + json json_voices = json::array(); + for (auto voice : voices) { + json_voices.push_back(voice); + } + res.set_content(safe_json_to_str(json_voices), MIMETYPE_JSON); + res.status = 
200; + }; + svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { std::string message; try { @@ -614,7 +639,7 @@ int main(int argc, const char ** argv) { res_error(res, formatted_error); return; } - struct simple_text_prompt_task * task = new simple_text_prompt_task(TTS, prompt); + struct simple_server_task * task = new simple_server_task(TTS, prompt); int id = task->id; generation_configuration * conf = new generation_configuration(); std::memcpy((void*)conf, default_generation_config, sizeof(generation_configuration)); @@ -661,7 +686,7 @@ int main(int argc, const char ** argv) { task->gen_config = conf; tqueue->push(task); - struct simple_text_prompt_task * rtask = rmap->get(id); + struct simple_server_task * rtask = rmap->get(id); if (!rtask->success) { json formatted_error = format_error_response(rtask->message, ERROR_TYPE_SERVER); res_error(res, formatted_error); @@ -728,7 +753,7 @@ int main(int argc, const char ** argv) { int id = task->id; tqueue->push(task); - struct simple_text_prompt_task * rtask = rmap->get(id); + struct simple_server_task * rtask = rmap->get(id); if (!rtask->success) { json formatted_error = format_error_response(rtask->message, ERROR_TYPE_SERVER); res_error(res, formatted_error); @@ -745,6 +770,25 @@ int main(int argc, const char ** argv) { &models_json ](const httplib::Request & _, httplib::Response & res) { res_ok_json(res, models_json); + } + + const auto handle_voices = [&args, &tqueue, &rmap, &res_error, &res_ok_voices](const httplib::Request & req, httplib::Response & res) { + struct simple_server_task * task = new simple_server_task(VOICES); + int id = task->id; + tqueue->push(task); + struct simple_server_task * rtask = rmap->get(id); + if (!rtask->success) { + json formatted_error; + if (has_prefix(rtask->message, "Voices are not supported")) { + formatted_error = format_error_response(rtask->message, ERROR_TYPE_NOT_SUPPORTED); + } else { + formatted_error = 
format_error_response(rtask->message, ERROR_TYPE_SERVER); + } + res_error(res, formatted_error); + return; + } + std::vector voices = split(rtask->message, ","); + res_ok_voices(res, voices); }; // register API routes @@ -753,6 +797,7 @@ int main(int argc, const char ** argv) { svr->Post("/v1/audio/speech", handle_tts); svr->Post("/v1/audio/conditional-prompt", handle_conditional); svr->Get("/v1/models", handle_models); + svr->Get("/v1/audio/voices", handle_voices); // Start the server svr->new_task_queue = [&args] { diff --git a/include/common.h b/include/common.h index fc0dcdf..4f932fc 100644 --- a/include/common.h +++ b/include/common.h @@ -28,6 +28,11 @@ const std::map SUPPORTED_ARCHITECTURES = { { "orpheus", ORPHEUS_ARCH } }; +const std::map ARCHITECTURE_NAMES = { + { PARLER_TTS_ARCH, "parler-tts" }, + { KOKORO_ARCH, "kokoro" }, +}; + struct generation_configuration { generation_configuration( std::string voice = "", @@ -55,6 +60,11 @@ struct tts_runner { tts_arch arch; struct ggml_context * ctx = nullptr; float sampling_rate = 44100.0f; + bool supports_voices = false; + + std::string arch_name() { + return ARCHITECTURE_NAMES.at(arch); + } void init_build(std::vector* buf_compute_meta); void free_build(); diff --git a/include/tts.h b/include/tts.h index def032b..30e98dc 100644 --- a/include/tts.h +++ b/include/tts.h @@ -16,6 +16,7 @@ struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * we struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true); int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config); void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only = true); +std::vector list_voices(tts_runner * runner); struct quantization_params { quantization_params(uint32_t n_threads, enum ggml_type quantize_type): 
n_threads(n_threads), quantize_type(quantize_type) {}; diff --git a/src/kokoro_model.cpp b/src/kokoro_model.cpp index a4b8dfc..f73dddb 100644 --- a/src/kokoro_model.cpp +++ b/src/kokoro_model.cpp @@ -1434,6 +1434,15 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response, return 0; } +std::vector kokoro_runner::list_voices() { + std::vector voices; + voices.reserve(model->voices.size()); + for (auto voice : model->voices) { + voices.push_back(voice.first); + } + return voices; +} + std::string get_espeak_id_from_kokoro_voice(std::string voice) { return !voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(voice[0]) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[voice[0]] : "gmw/en-US"; diff --git a/src/kokoro_model.h b/src/kokoro_model.h index 1985c11..cd332e6 100644 --- a/src/kokoro_model.h +++ b/src/kokoro_model.h @@ -426,6 +426,7 @@ struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, in struct kokoro_runner : tts_runner { kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) { tts_runner::sampling_rate = 24000.0f; + tts_runner::supports_voices = true; }; ~kokoro_runner() { if (ctx) { @@ -448,8 +449,8 @@ struct kokoro_runner : tts_runner { void init_build() { tts_runner::init_build(&kctx->buf_compute_meta); } - + std::vector list_voices(); std::vector> tokenize_chunks(std::vector clauses); void assign_weight(std::string name, ggml_tensor * tensor); void prepare_post_load(); diff --git a/src/tts.cpp b/src/tts.cpp index 348144e..0da56e4 100644 --- a/src/tts.cpp +++ b/src/tts.cpp @@ -176,6 +176,15 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re } } +std::vector list_voices(tts_runner * runner) { + switch(runner->arch) { + case KOKORO_ARCH: + return ((kokoro_runner*)runner)->list_voices(); + 
default: + TTS_ABORT("%s failed. The architecture '%d' does not support #list_voices supported.", __func__, runner->arch); + } +} + void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) { int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads; ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only); From 477c31d7d63dcc55d85d895a7edc9db223c7259e Mon Sep 17 00:00:00 2001 From: Matthew Willett-Jeffries Date: Thu, 1 May 2025 17:29:48 -0400 Subject: [PATCH 2/9] update readme --- examples/server/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 6f30f15..4afdbfd 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -83,9 +83,16 @@ curl http://127.0.0.1:8080/v1/audio/speech \ The only required parameter is `input` otherwise generation configuration will be determined by the defaults set on server initialization, and the `response_format` will use `wav`. The `response_format` field currently supports only `wav` and `aiff` audio formats. +#### Voices + +For models that support voices, a complete JSON list of supported voices can be queried via the voices endpoint, `/v1/audio/voices`: + +```commandline +curl http://127.0.0.1:8080/v1/audio/voices +``` + ### Future Work Future work will include: * Support for token authentication and permissioning -* Multiple model support * Streaming audio, for longform audio generation. 
From d890547661708495043c5fb9d004e22f7ea5fe72 Mon Sep 17 00:00:00 2001 From: Matthew Willett-Jeffries Date: Thu, 1 May 2025 18:09:16 -0400 Subject: [PATCH 3/9] cruft --- src/tts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tts.cpp b/src/tts.cpp index 0da56e4..f5faf28 100644 --- a/src/tts.cpp +++ b/src/tts.cpp @@ -182,7 +182,7 @@ std::vector list_voices(tts_runner * runner) { return ((kokoro_runner*)runner)->list_voices(); default: TTS_ABORT("%s failed. The architecture '%d' does not support #list_voices supported.", __func__, runner->arch); - } + } } void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) { From 3167ff6dddf1b5d91249c8e01202beea6ca2290a Mon Sep 17 00:00:00 2001 From: ecyht2 Date: Sun, 22 Jun 2025 16:10:10 +0800 Subject: [PATCH 4/9] fix: Fixed voices API to work with multiple models --- examples/server/server.cpp | 76 +++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 206f488..10c7c0d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -251,7 +251,7 @@ struct worker { } } - void process_task(struct simple_server_task * task) { + const void process_task(struct simple_server_task * task) { if (task->timed_out(task_timeout)) { return; } @@ -279,16 +279,31 @@ struct worker { response_map->push(task); break; case VOICES: - if (!runner->supports_voices) { - task->message = "Voices are not supported for architecture '" + runner->arch_name() + "'."; - response_map->push(task); - break; + // Maybe there is a better way to pass the voices rather than + // needing a custom serialized message? 
+ // Getting all voices + std::unordered_map voice_map = {}; + for (const auto &[id, runner] : runners) { + if (!runner->supports_voices) { + continue; + } + std::string voices_string = ""; + for (auto voice : list_voices(runner)) { + if (!voices_string.empty()) { + voices_string += ","; + } + voices_string += voice; + } + voice_map[id] = voices_string; } - for (auto voice : list_voices(runner)) { + // Formatting final message + for (const auto &[id, voices] : voice_map) { if (!task->message.empty()) { - task->message += ","; + task->message += ";"; } - task->message += voice; + task->message += id; + task->message += "/"; + task->message += voices; } task->success = true; response_map->push(task); @@ -462,6 +477,7 @@ int main(int argc, const char ** argv) { svr.reset(new httplib::Server()); #endif + // Models Variables std::unordered_map model_map = {}; const std::string model_path = args.get_string_param("--model-path"); if (std::filesystem::is_directory(model_path)) { @@ -509,6 +525,9 @@ int main(int argc, const char ** argv) { } const json models_json = {{"object", "list"}, {"data", models}}; + // Voices Variables + json voices_json = nullptr; + std::atomic state{LOADING}; svr->set_logger(log_server_request); @@ -534,15 +553,6 @@ int main(int argc, const char ** argv) { res.status = 200; }; - auto res_ok_voices = [](httplib::Response & res, const std::vector & voices) { - json json_voices = json::array(); - for (auto voice : voices) { - json_voices.push_back(voice); - } - res.set_content(safe_json_to_str(json_voices), MIMETYPE_JSON); - res.status = 200; - }; - svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { std::string message; try { @@ -736,7 +746,7 @@ int main(int argc, const char ** argv) { return; } std::string prompt = data.at("input").get(); - struct simple_text_prompt_task * task = new simple_text_prompt_task(CONDITIONAL_PROMPT, prompt); + struct simple_server_task * task = new 
simple_server_task(CONDITIONAL_PROMPT, prompt); if (data.contains("model") && data.at("model").is_string()) { const std::string model = data.at("model"); @@ -770,10 +780,27 @@ int main(int argc, const char ** argv) { &models_json ](const httplib::Request & _, httplib::Response & res) { res_ok_json(res, models_json); - } + }; + + const auto handle_voices = [ + &args, + &tqueue, + &rmap, + &res_error, + &res_ok_json, + &voices_json, + &default_model + ](const httplib::Request & req, httplib::Response & res) { + // Using Cached Values + if (!voices_json.is_null()) { + res_ok_json(res, voices_json); + return; + } - const auto handle_voices = [&args, &tqueue, &rmap, &res_error, &res_ok_voices](const httplib::Request & req, httplib::Response & res) { struct simple_server_task * task = new simple_server_task(VOICES); + // Setting the model to default model (as dummy value) so no new runner is created + task->model = default_model; + int id = task->id; tqueue->push(task); struct simple_server_task * rtask = rmap->get(id); @@ -787,8 +814,13 @@ int main(int argc, const char ** argv) { res_error(res, formatted_error); return; } - std::vector voices = split(rtask->message, ","); - res_ok_voices(res, voices); + voices_json = json::object(); + std::vector model_voices = split(rtask->message, ";"); + for (const std::string entry : model_voices) { + const std::vector entry_split = split(entry, "/"); + voices_json[entry_split[0]] = split(entry_split[1], ","); + } + res_ok_json(res, voices_json); }; // register API routes From bb3f8767b771227636d194e827466b78d346c6b3 Mon Sep 17 00:00:00 2001 From: ecyht2 Date: Sun, 22 Jun 2025 17:34:25 +0800 Subject: [PATCH 5/9] feat: Added voice selection in server API page --- examples/server/public/index.html | 121 ++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index ffaa29c..d43cdf0 100644 --- a/examples/server/public/index.html 
+++ b/examples/server/public/index.html @@ -60,41 +60,6 @@ gap: 10px; } - select { - appearance: base-select; - flex-grow: 1; - box-sizing: border-box; - padding: 10px; - border: 1px solid #d1d5db; - border-radius: 6px; - background: none; - font-family: inherit; - font-size: 0.875rem; - transition: - border-color 0.2s, - box-shadow 0.2s; - } - - select:focus { - outline: none; - border-color: #3b82f6; - box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.2); - } - - ::picker(select) { - appearance: base-select; - flex-grow: 1; - box-sizing: border-box; - padding: 10px; - border: 1px solid #d1d5db; - border-radius: 6px; - font-family: inherit; - font-size: 0.875rem; - transition: - border-color 0.2s, - box-shadow 0.2s; - } - .refresh-btn { padding-right: 9.5px; padding-left: 9.5px; @@ -136,7 +101,8 @@ } textarea, - input[type="text"] { + input[type="text"], + select { box-sizing: border-box; width: 100%; padding: 10px; @@ -148,7 +114,8 @@ } textarea:focus, - input[type="text"]:focus { + input[type="text"]:focus, + select:focus { outline: none; border-color: #3b82f6; box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.2); @@ -159,6 +126,25 @@ resize: vertical; } + select { + appearance: base-select; + background: none; + } + + ::picker(select) { + appearance: base-select; + flex-grow: 1; + box-sizing: border-box; + padding: 10px; + border: 1px solid #d1d5db; + border-radius: 6px; + font-family: inherit; + font-size: 0.875rem; + transition: + border-color 0.2s, + box-shadow 0.2s; + } + .slider-container { margin-top: 8px; } @@ -369,6 +355,14 @@

TTS.cpp Server API

API key for authentication (does nothing for now)

+
+ + +

Voice to use for the speech (not all models have voices)

+
+