diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..907598d --- /dev/null +++ b/.clang-format @@ -0,0 +1,162 @@ +# Adapted from llama.cpp +# fab5d30ff6729ff6ff615c41e8c0215d6bc30393 by Diego Devesa +--- +Language: Cpp +AlignAfterOpenBracket: Align +AlignArrayOfStructures: Left +AlignConsecutiveAssignments: AcrossComments +AlignConsecutiveBitFields: AcrossComments +AlignConsecutiveDeclarations: AcrossComments +AlignConsecutiveMacros: AcrossComments +# AlignConsecutiveShortCaseStatements: AcrossComments +AlignEscapedNewlines: Left # LeftWithLastLine +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: true +BinPackArguments: true +BinPackParameters: true # OnePerLine +BitFieldColonSpacing: Both +BreakBeforeBraces: Custom # Attach +BraceWrapping: + AfterCaseLabel: true + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +# BreakAdjacentStringLiterals: true +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: None +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: false +# BreakBinaryOperations: Never +BreakConstructorInitializers: AfterColon +# BreakFunctionDefinitionParameters: false +BreakInheritanceList: AfterComma 
+BreakStringLiterals: true +# BreakTemplateDeclarations: Yes +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: Leave +EmptyLineAfterAccessModifier: Never +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: true +IndentCaseLabels: true +IndentExternBlock: NoIndent +IndentGotoLabels: false +IndentPPDirectives: AfterHash +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertBraces: true # NOTE: may lead to incorrect formatting +InsertNewlineAtEOF: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PPIndentWidth: -1 +PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Middle +QualifierAlignment: Left +#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict'] +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' +ReferenceAlignment: Middle 
+ReflowComments: false # IndentOnly +SeparateDefinitionBlocks: Always +SortIncludes: CaseInsensitive +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: c++20 +TabWidth: 4 +UseTab: Never +WhitespaceSensitiveMacros: ['STRINGIZE'] +... diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fe3267..b58895c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,13 +42,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(TTS_STANDALONE ON) - - include(git-vars) -else() - set(TTS_STANDALONE OFF) -endif() +include(git-vars) option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) @@ -82,10 +76,8 @@ set(GGML_FATAL_WARNINGS ${TTS_FATAL_WARNINGS}) # build lib -if (NOT TARGET ggml) - add_subdirectory(ggml) - # ... 
otherwise assume ggml is added by a parent CMakeLists.txt -endif() +add_subdirectory(ggml) +add_subdirectory(ggml-patches) add_subdirectory(src) # install tts diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 103f216..1bb454a 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -1,10 +1,12 @@ -#include "tts.h" +#include + +#include "../../src/models/loaders.h" #include "args.h" #include "common.h" +#include "ggml.h" #include "playback.h" #include "vad.h" #include "write_file.h" -#include class tts_timing_printer { const int64_t start_us{[] { @@ -64,24 +66,24 @@ int main(int argc, const char ** argv) { exit(1); } - generation_configuration * config = new generation_configuration( - args.get_string_param("--voice"), - *args.get_int_param("--topk"), - *args.get_float_param("--temperature"), - *args.get_float_param("--repetition-penalty"), + const generation_configuration config{ + args.get_string_param("--voice"), + *args.get_int_param("--topk"), + *args.get_float_param("--temperature"), + *args.get_float_param("--repetition-penalty"), !args.get_bool_param("--no-cross-attn"), args.get_string_param("--espeak-voice-id"), *args.get_int_param("--max-tokens"), - *args.get_float_param("--top-p")); + *args.get_float_param("--top-p")}; - struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal")); + unique_ptr runner{runner_from_file(args.get_string_param("--model-path").c_str(), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"))}; - if (conditional_prompt.size() > 0) { - update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true); + if (!conditional_prompt.empty()) { + runner->update_conditional_prompt(text_encoder_path.c_str(), conditional_prompt.c_str()); } tts_response data; - generate(runner, args.get_string_param("--prompt"), &data, config); + 
runner->generate(args.get_string_param("--prompt").c_str(), data, config); if (data.n_outputs == 0) { fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str()); exit(1); @@ -92,5 +94,6 @@ int main(int argc, const char ** argv) { if (!play_tts_response(args, data, runner->sampling_rate)) { write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate); } + static_cast(!runner.release()); // TODO the destructor doesn't work yet return 0; } diff --git a/examples/perf_battery/perf_battery.cpp b/examples/perf_battery/perf_battery.cpp index 36d0cbc..c8a1c62 100644 --- a/examples/perf_battery/perf_battery.cpp +++ b/examples/perf_battery/perf_battery.cpp @@ -1,16 +1,12 @@ -#include "tts.h" -#include "args.h" -#include "common.h" #include + #include #include #include - -std::vector ARCH_LOOKUP = { - "parler-tts", - "kokoro", -}; +#include "../../src/models/loaders.h" +#include "args.h" +#include "common.h" using perf_cb = std::function; @@ -67,15 +63,14 @@ double mean(std::vector series) { return (double) sum / series.size(); } -std::string benchmark_printout(tts_arch arch, std::vector generation_samples, std::vector output_times) { - std::string arch_name = ARCH_LOOKUP[(int)arch]; +std::string benchmark_printout(const char * arch, std::vector generation_samples, std::vector output_times) { double gen_mean = mean(generation_samples); std::vector gen_output; for (int i = 0; i < (int) output_times.size(); i++) { gen_output.push_back(generation_samples[i]/output_times[i]); } double gen_out_mean = mean(gen_output); - std::string printout = (std::string) "Mean Stats for arch " + arch_name + ":\n\n" + (std::string) " Generation Time (ms): " + std::to_string(gen_mean) + (std::string) "\n"; + std::string printout = (std::string) "Mean Stats for arch " + arch + ":\n\n" + (std::string) " Generation Time (ms): " + std::to_string(gen_mean) + (std::string) "\n"; printout += (std::string) " Generation Real Time Factor (ms): 
" + std::to_string(gen_out_mean) + (std::string) "\n"; return printout; } @@ -102,22 +97,23 @@ int main(int argc, const char ** argv) { } args.validate(); - generation_configuration * config = new generation_configuration(args.get_string_param("--voice"), *args.get_int_param("--topk"), *args.get_float_param("--temperature"), *args.get_float_param("--repetition-penalty"), !args.get_bool_param("--no-cross-attn")); + const generation_configuration config{args.get_string_param("--voice"), *args.get_int_param("--topk"), *args.get_float_param("--temperature"), *args.get_float_param("--repetition-penalty"), !args.get_bool_param("--no-cross-attn")}; - struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal")); + unique_ptr runner{runner_from_file(args.get_string_param("--model-path").c_str(), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"))}; std::vector generation_samples; std::vector output_times; for (std::string sentence : TEST_SENTENCES) { tts_response response; perf_cb cb = [&]{ - generate(runner, sentence, &response, config); + runner->generate(sentence.c_str(), response, config); }; double generation_ms = benchmark_ms(cb); output_times.push_back((double)(response.n_outputs / 44.1)); generation_samples.push_back(generation_ms); } - fprintf(stdout, "%s", benchmark_printout(runner->arch, generation_samples, output_times).c_str()); + fprintf(stdout, "%s", benchmark_printout(runner->loader.get().arch, generation_samples, output_times).c_str()); + static_cast(!runner.release()); // TODO the destructor doesn't work yet return 0; } diff --git a/examples/phonemize/phonemize.cpp b/examples/phonemize/phonemize.cpp index 83d551d..636e6f4 100644 --- a/examples/phonemize/phonemize.cpp +++ b/examples/phonemize/phonemize.cpp @@ -1,7 +1,8 @@ -#include "phonemizer.h" -#include "args.h" #include +#include "../../src/models/kokoro/phonemizer.h" 
+#include "args.h" + int main(int argc, const char ** argv) { arg_list args; args.add_argument(string_arg("--phonemizer-path", "(OPTIONAL) The local path of the gguf phonemiser file for TTS.cpp phonemizer. This is required if not using espeak.", "-mp")); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fda21c8..254a619 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,2 +1,6 @@ -add_executable(quantize quantize.cpp) +add_executable(quantize + quantize.cpp + quantize_impl.cpp + quantize_impl.h +) target_link_libraries(quantize PRIVATE ggml tts) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 2cad888..ffb41f8 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,11 +1,12 @@ -#include -#include +#include #include +#include #include +#include "../../src/models/loaders.h" #include "args.h" #include "ggml.h" -#include "tts.h" +#include "quantize_impl.h" const std::map valid_quantization_types = { {"FP16", GGML_TYPE_F16}, @@ -42,12 +43,15 @@ int main(int argc, const char ** argv) { qtype.c_str()); exit(1); } - struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), valid_quantization_types.at(qtype)); - qp->quantize_output_heads = args.get_bool_param("--quantize-output-heads"); - qp->quantize_text_embeddings = args.get_bool_param("--quantize-text-embedding"); - qp->quantize_cross_attn_kv = args.get_bool_param("--quantize-cross-attn-kv"); - qp->convert_dac_to_f16 = args.get_bool_param("--convert-dac-to-f16"); - qp->convert_non_quantizable_to_f16 = args.get_bool_param("--convert-non-quantized-to-f16"); - quantize_gguf(args.get_string_param("--model-path"), args.get_string_param("--quantized-model-path"), qp); + quantization_params qp { + .n_threads{ static_cast(*args.get_int_param("--n-threads")) }, + .quantize_type{valid_quantization_types.at(qtype)}, // quantization type + 
.quantize_output_heads{ args.get_bool_param("--quantize-output-heads")}, + .quantize_text_embeddings{args.get_bool_param("--quantize-text-embedding")}, + .quantize_cross_attn_kv{ args.get_bool_param("--quantize-cross-attn-kv")}, + .convert_dac_to_f16{ args.get_bool_param("--convert-dac-to-f16")}, + .convert_non_quantizable_to_f16{ args.get_bool_param("--convert-non-quantized-to-f16")}, + }; + quantize_gguf(args.get_string_param("--model-path").c_str(), args.get_string_param("--quantized-model-path").c_str(), qp); return 0; } diff --git a/examples/quantize/quantize_impl.cpp b/examples/quantize/quantize_impl.cpp new file mode 100644 index 0000000..5dce2fa --- /dev/null +++ b/examples/quantize/quantize_impl.cpp @@ -0,0 +1,293 @@ +#include "quantize_impl.h" + +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "ggml-cpp.h" +#include "util.h" + +static bool kokoro_is_f16_compatible(std::string_view name) { + return name.find("voice_tensors") == std::string::npos && name.find("bias") == std::string::npos && + name.find("gamma") == std::string::npos && name.find("beta") == std::string::npos && + name.find("alpha") == std::string::npos && !name.ends_with("embd") && !name.ends_with("norm"); +} + +static bool kokoro_is_quantizable(const std::string & name, const quantization_params & params) { + // A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors. 
+ static constexpr std::array DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = { + "duration_proj", "encode", "shared_lstm", "duration_lstm", "layers" + }; + + if (kokoro_is_f16_compatible(name)) { + if (name.starts_with("kokoro.albert") || name.starts_with("kokoro.text_encoder.lstm")) { + return true; + } + if (name.starts_with("kokoro.duration_predictor.")) { + std::vector parts = split(name, "."); + for (const auto part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) { + if (part == parts[2]) { + return true; + } + } + } + } + return false; +} + +static bool dia_is_quantizable(std::string_view name, const quantization_params & params) { + // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized. + bool quantizable = !name.starts_with("audio_encoder") && !name.ends_with("norm"); + if (!params.quantize_output_heads) { + quantizable = quantizable && !name.starts_with("dia.decoder.heads"); + } + return quantizable; +} + +static bool parler_is_quanitizable(std::string_view name, const quantization_params & params) { + // the DAC audio encoder / decoder is not compatible with quantization, normalization weight shouldn't be quantized, and the text encoding shouldn't be normalized. 
+ bool quantizable = !name.starts_with("audio_encoder") && !name.ends_with("norm.weight") && + !name.ends_with("text_encoding") && !name.ends_with("positional_embed") && + !name.ends_with("norm.bias"); + if (!params.quantize_output_heads) { + quantizable = quantizable && !name.ends_with("weight.head"); + } + if (!params.quantize_text_embeddings) { + quantizable = quantizable && !name.ends_with("embed_prompts"); + } + if (!params.quantize_cross_attn_kv) { + quantizable = quantizable && !name.ends_with("encoder_attn.k_proj.weight") && + !name.ends_with("encoder_attn.v_proj.weight"); + } + return quantizable; +} + +static bool is_quantizable(tts_arch arch, const std::string & name, const quantization_params & params) { + switch (arch) { + case PARLER_TTS_ARCH: + return parler_is_quanitizable(name, params); + case DIA_ARCH: + return dia_is_quantizable(name, params); + case KOKORO_ARCH: + return kokoro_is_quantizable(name, params); + default: + GGML_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch); + } +} + +static size_t quantize_tensor(void * new_data, const ggml_tensor * tensor, const float * imatrix, ggml_type qtype, + uint32_t n_threads) { + // much of this is form copied from llama.cpp + int chunk_size_multiplier = 1; + if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) { + if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0) || tensor->ne[1] % 4 != 0) { + qtype = GGML_TYPE_Q4_0; + } + if (qtype == GGML_TYPE_Q4_0_8_8) { + chunk_size_multiplier = 8; + } else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) { + chunk_size_multiplier = 4; + } + } + size_t out_size = 0; + const int32_t d3_step = tensor->ne[0] * tensor->ne[1]; + const int32_t n_per_row = tensor->ne[0]; + const int32_t nrows = tensor->ne[1]; + static constexpr int32_t min_chunk_size = 32 * 512; + const int32_t chunk_size = + (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row)) * + chunk_size_multiplier; + uint32_t thread_count = + std::max(1, std::min(static_cast(n_threads), (int) (d3_step + chunk_size - 1) / chunk_size)); + std::mutex mutex; + + for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) { + const float * f32_data_d3 = static_cast(tensor->data) + d3_index * d3_step; + void * new_data_d3 = static_cast(new_data) + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows; + // const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr; + if (thread_count <= 1) { + // not threaded + out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix); + } else { + std::vector threads; + int64_t counter = 0; + size_t new_size = 0; + bool valid = true; + for (uint32_t t = 0; t < thread_count; t++) { + auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, + n_per_row, imatrix]() { + const int64_t nrows_per_chunk = chunk_size / n_per_row; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + int64_t first_row = counter; + counter += nrows_per_chunk; + if (first_row >= nrows) { + if (local_size > 0) { + new_size += local_size; + } + break; + } + lock.unlock(); + const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); + const size_t this_size = ggml_quantize_chunk( + qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix); + local_size += this_size; + + // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded. 
+ const size_t row_size = ggml_row_size(qtype, n_per_row); + void * this_data = static_cast(new_data_d3) + first_row * row_size; + if (!ggml_validate_row_data(qtype, this_data, this_size)) { + std::unique_lock lock(mutex); + valid = false; + break; + } + } + }; + threads.emplace_back(func); + } + for (auto & t : threads) { + t.join(); + } + + if (!valid) { + GGML_ABORT( + "Validation of quantized data failed. Please try again and/or switch to single thread " + "quantization.\n"); + } + out_size += new_size; + } + } + return out_size; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +template struct no_init { + T value; + + no_init() { /* do nothing */ } +}; + +void quantize_gguf(const char * ifile, const char * ofile, const quantization_params & params) { + ggml_context * weight_ctx{}; + gguf_init_params gguf_params{ + .no_alloc{ false }, + .ctx{ &weight_ctx }, + }; + gguf_context * meta_ctx = gguf_init_from_file(ifile, gguf_params); + std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture. + + if (int arch_key = gguf_find_key(meta_ctx, "general.architecture"); arch_key != -1) { + arch = std::string(gguf_get_val_str(meta_ctx, arch_key)); + } + tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch); + + if (params.quantize_type != GGML_TYPE_Q5_0 && params.quantize_type != GGML_TYPE_Q8_0 && + params.quantize_type != GGML_TYPE_F16 && params.quantize_type != GGML_TYPE_Q4_0) { + fprintf(stdout, "Warning, %s is untested for quantization type '%d'. 
Use at your own risk.\n", arch.c_str(), + params.quantize_type); + } + + gguf_context_ptr ctx_out{ gguf_init_empty() }; + + // copy the KV pairs from the input file + gguf_set_kv(ctx_out.get(), meta_ctx); + gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params.quantize_type); + for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; + tensor = ggml_get_next_tensor(weight_ctx, tensor)) { + if (*ggml_get_name(tensor)) { + gguf_add_tensor(ctx_out.get(), tensor); + } + } + + std::vector> work; + + std::ofstream fout; + auto close_ofstream = [&]() { + // Write metadata and close file handler + if (fout.is_open()) { + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out.get())); + gguf_get_meta_data(ctx_out.get(), data.data()); + fout.write(reinterpret_cast(data.data()), data.size()); + fout.close(); + } + }; + auto new_ofstream = [&]() { + std::string fname = ofile; + fout = std::ofstream(fname, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + const size_t meta_size = gguf_get_meta_size(ctx_out.get()); + // placeholder for the meta data + ::zeros(fout, meta_size); + }; + new_ofstream(); + for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { + ggml_type new_type; + void * new_data; + size_t new_size; + const char * const name = ggml_get_name(cur); + const std::string_view name_sv{ name }; + + if (!*name) { + continue; + } + + if (is_quantizable(arch_type, name, params)) { + if ((cur->type) != GGML_TYPE_F32) { + GGML_ABORT( + "ERROR: All quantized tensors must be transformed from 32bit floats. 
Tensor, '%s', has improper " + "type, '%d'\n", + cur->name, cur->type); + } + new_type = params.quantize_type; + if ((new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS)) { + GGML_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type); + } + const int64_t nelement_size = ggml_nelements(cur) * 4; + if (work.size() < static_cast(nelement_size)) { + work.resize(nelement_size); // upper bound on size + } + new_data = work.data(); + new_size = quantize_tensor(new_data, cur, nullptr, new_type, params.n_threads); + } else if ((params.convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || + (params.convert_dac_to_f16 && name_sv.starts_with("audio_encoder") && !name_sv.ends_with("alpha"))) { + if ((cur->type) != GGML_TYPE_F32) { + GGML_ABORT( + "ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper " + "type, '%d'\n", + cur->name, cur->type); + } + new_type = GGML_TYPE_F16; + const int64_t nelement_size = ggml_nelements(cur) * 4; + if (work.size() < static_cast(nelement_size)) { + work.resize(nelement_size); // upper bound on size + } + new_data = work.data(); + new_size = quantize_tensor(new_data, cur, nullptr, new_type, params.n_threads); + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); + } + + gguf_set_tensor_type(ctx_out.get(), name, new_type); + gguf_set_tensor_data(ctx_out.get(), name, new_data, new_size); + fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name, new_size); + // write tensor data + padding + fout.write(static_cast(new_data), new_size); + zeros(fout, GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT) - new_size); + } + close_ofstream(); +} diff --git a/examples/quantize/quantize_impl.h b/examples/quantize/quantize_impl.h new file mode 100644 index 0000000..4c99eaf --- /dev/null +++ b/examples/quantize/quantize_impl.h @@ -0,0 +1,15 @@ +#pragma once + +#include "ggml.h" + +struct quantization_params { + uint32_t 
n_threads; + ggml_type quantize_type; // quantization type + bool quantize_output_heads; + bool quantize_text_embeddings; + bool quantize_cross_attn_kv; + bool convert_dac_to_f16; + bool convert_non_quantizable_to_f16; +}; + +void quantize_gguf(const char * ifile, const char * ofile, const quantization_params & params); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 10c7c0d..bf9bd88 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -12,26 +12,27 @@ #define MIMETYPE_JSON "application/json; charset=utf-8" #define MIMETYPE_HTML "text/html; charset=utf-8" +#include + #include +#include +#include #include #include -#include #include +#include #include #include -#include #include #include -#include #include -#include -#include "tts.h" -#include "audio_file.h" + +#include "../../src/models/loaders.h" #include "args.h" +#include "audio_file.h" #include "common.h" -#include "tts_server_threading_osx.h" - #include "index.html.hpp" +#include "tts_server_threading_osx.h" enum server_state { LOADING, // Server is starting up / model loading @@ -106,7 +107,7 @@ struct simple_server_task { task_type task; int id; std::string prompt; - generation_configuration * gen_config; + generation_configuration gen_config; void * response; size_t length; bool success = false; @@ -224,14 +225,15 @@ void init_response_map(simple_response_map * rmap) { struct worker { worker(struct simple_task_queue * task_queue, struct simple_response_map * response_map, std::string text_encoder_path = "", int task_timeout = 300): task_queue(task_queue), response_map(response_map), text_encoder_path(text_encoder_path), task_timeout(task_timeout) {}; ~worker() { - for (auto &[_, runner]: runners) { - delete runner; + // runners.clear(); + for (auto & runner : views::values(runners)) { + static_cast(!runner.release()); // TODO the destructor doesn't work yet } } struct simple_task_queue * task_queue; struct simple_response_map * response_map; - 
std::unordered_map runners; + unordered_map> runners{}; std::string text_encoder_path; std::atomic running = true; tts_server_threading::native_thread * thread = nullptr; @@ -255,17 +257,16 @@ struct worker { if (task->timed_out(task_timeout)) { return; } - int outcome; tts_response * data = nullptr; - tts_runner* runner = runners[task->model]; + tts_generation_runner & runner{*runners[task->model]}; switch(task->task) { case TTS: - data = new tts_response; - outcome = generate(runner, task->prompt, data, task->gen_config); - task->response = (void*) data->data; - task->length = data->n_outputs; - task->sample_rate = runner->sampling_rate; - task->success = outcome == 0; + data = new tts_response; + runner.generate(task->prompt.c_str(), *data, task->gen_config); + task->response = (void *) data->data; + task->length = data->n_outputs; + task->sample_rate = runner.sampling_rate; + task->success = data->n_outputs != 0; response_map->push(task); break; case CONDITIONAL_PROMPT: @@ -274,7 +275,7 @@ struct worker { response_map->push(task); break; } - update_conditional_prompt(runner, text_encoder_path, task->prompt); + runner.update_conditional_prompt(text_encoder_path.c_str(), task->prompt.c_str()); task->success = true; response_map->push(task); break; @@ -287,8 +288,8 @@ struct worker { if (!runner->supports_voices) { continue; } - std::string voices_string = ""; - for (auto voice : list_voices(runner)) { + std::string voices_string{}; + for (const auto voice : runner->list_voices()) { if (!voices_string.empty()) { voices_string += ","; } @@ -312,9 +313,9 @@ struct worker { } }; -void init_worker(std::unordered_map* model_path, int n_threads, bool cpu_only, generation_configuration * config, worker * w) { +void init_worker(std::unordered_map* model_path, int n_threads, bool cpu_only, const generation_configuration & config, worker * w) { for (const auto &[id, path] : *model_path) { - w->runners[id] = runner_from_file(path, n_threads, config, cpu_only); + 
w->runners[id] = runner_from_file(path.c_str(), n_threads, config, cpu_only); } w->loop(); } @@ -444,7 +445,7 @@ int main(int argc, const char ** argv) { exit(1); } - generation_configuration * default_generation_config = new generation_configuration( + const generation_configuration default_generation_config{ args.get_string_param("--voice"), *args.get_int_param("--topk"), *args.get_float_param("--temperature"), @@ -452,7 +453,7 @@ int main(int argc, const char ** argv) { !args.get_bool_param("--no-cross-attn"), args.get_string_param("--espeak-voice-id"), 0, - *args.get_float_param("--top-p")); + *args.get_float_param("--top-p")}; worker_pool * pool = nullptr; struct simple_task_queue * tqueue = new simple_task_queue; @@ -651,34 +652,33 @@ int main(int argc, const char ** argv) { } struct simple_server_task * task = new simple_server_task(TTS, prompt); int id = task->id; - generation_configuration * conf = new generation_configuration(); - std::memcpy((void*)conf, default_generation_config, sizeof(generation_configuration)); + generation_configuration conf{default_generation_config}; float temp; float rep_pen; float top_p; int top_k; if (data.contains("temperature") && data.at("temperature").is_number()) { temp = data.at("temperature").get(); - conf->temperature = temp; + conf.temperature = temp; } if (data.contains("top_k") && data.at("top_k").is_number()) { top_k = data.at("top_k").get(); - conf->top_k = top_k; + conf.top_k = top_k; } if (data.contains("top_p") && data.at("top_p").is_number()) { top_p = data.at("top_p").get(); - conf->top_p = top_p; + conf.top_p = top_p; } if (data.contains("repetition_penalty") && data.at("repetition_penalty").is_number()) { rep_pen = data.at("repetition_penalty").get(); - conf->repetition_penalty = rep_pen; + conf.repetition_penalty = rep_pen; } if (data.contains("voice") && data.at("voice").is_string()) { - conf->voice = data.at("voice").get(); + conf.voice = data.at("voice").get(); } if (data.contains("model") && 
data.at("model").is_string()) { diff --git a/ggml b/ggml index 136da02..70ba160 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 136da02ac32d5011cf9b46b117a0ea1be24e2bad +Subproject commit 70ba16054447fb613d7c4fba76eeaf9ec0bfbeab diff --git a/ggml-patches/.clang-format-ignore b/ggml-patches/.clang-format-ignore new file mode 100644 index 0000000..42dea25 --- /dev/null +++ b/ggml-patches/.clang-format-ignore @@ -0,0 +1,2 @@ +llama-mmap.cpp +llama-mmap.h diff --git a/ggml-patches/CMakeLists.txt b/ggml-patches/CMakeLists.txt new file mode 100644 index 0000000..8672efc --- /dev/null +++ b/ggml-patches/CMakeLists.txt @@ -0,0 +1,6 @@ +target_sources(ggml PRIVATE + ggml-iterator.h + llama-mmap.cpp + llama-mmap.h +) +target_include_directories(ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/ggml-patches/README.txt b/ggml-patches/README.txt new file mode 100644 index 0000000..8923ab8 --- /dev/null +++ b/ggml-patches/README.txt @@ -0,0 +1 @@ +TODO: upstream this diff --git a/ggml-patches/ggml-iterator.h b/ggml-patches/ggml-iterator.h new file mode 100644 index 0000000..5cd87f2 --- /dev/null +++ b/ggml-patches/ggml-iterator.h @@ -0,0 +1,64 @@ +#pragma once + +#include + +#include "ggml.h" + +class gguf_key_iterator { + const gguf_context * const ctx; + const int n_kv; + int i{}; + + public: + explicit gguf_key_iterator(const gguf_context & ctx) : ctx{ &ctx }, n_kv{ gguf_get_n_kv(&ctx) } {} + + std::pair operator*() const { return { i, gguf_get_key(ctx, i) }; } + + gguf_key_iterator & operator++() { + ++i; + return *this; + } + + gguf_key_iterator begin() const { + auto result{ *this }; + result.i = 0; + return result; + } + + gguf_key_iterator end() const { + auto result{ *this }; + result.i = n_kv; + return result; + } + + bool operator==(const gguf_key_iterator &) const = default; +}; + +class ggml_tensor_iterator { + const ggml_context * const ctx; + ggml_tensor * cur; + + public: + explicit ggml_tensor_iterator(const ggml_context & ctx) : ctx{ &ctx }, 
cur{ ggml_get_first_tensor(&ctx) } {} + + ggml_tensor & operator*() const { return *cur; } + + ggml_tensor_iterator & operator++() { + cur = ggml_get_next_tensor(ctx, cur); + return *this; + } + + ggml_tensor_iterator begin() const { + auto result{ *this }; + result.cur = ggml_get_first_tensor(ctx); + return result; + } + + ggml_tensor_iterator end() const { + auto result{ *this }; + result.cur = nullptr; + return result; + } + + bool operator==(const ggml_tensor_iterator &) const = default; +}; diff --git a/ggml-patches/llama-mmap.cpp b/ggml-patches/llama-mmap.cpp new file mode 100644 index 0000000..9a2f166 --- /dev/null +++ b/ggml-patches/llama-mmap.cpp @@ -0,0 +1,638 @@ +#include "llama-mmap.h" + +#include "ggml.h" +#include "../ggml/src/ggml-impl.h" + +#include +#include +#include +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +#if defined(__APPLE__) +#include +#endif + +// TODO: consider moving to llama-impl.h if needed in more places +#if defined(_WIN32) +#include + +static std::string llama_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +#define USE_MSYNC false + +// llama_file + +struct llama_file::impl { +#if defined(_WIN32) + HANDLE fp_win32; + std::string GetErrorMessageWin32(DWORD error_code) const { + std::string ret; + LPSTR lpMsgBuf = NULL; + DWORD bufLen = 
FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL); + if (!bufLen) { + std::ostringstream ss; + ss << "Win32 error code: " << std::hex << error_code; + ret = ss.str(); + } else { + ret = lpMsgBuf; + LocalFree(lpMsgBuf); + } + + return ret; + } + + impl(const char * fname, const char * mode) { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(std::string{"failed to open "} + fname + ": " + strerror(errno)); + } + fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp)); + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { + LARGE_INTEGER li; + li.QuadPart = 0; + BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT); + if (!ret) { + throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError())); + } + + return li.QuadPart; + } + + void seek(size_t offset, int whence) const { + static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN"); + static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT"); + static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END"); + + LARGE_INTEGER li; + li.QuadPart = offset; + BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence); + if (!ret) { + throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError())); + } + } + + void read_raw(void * ptr, size_t len) const { + size_t bytes_read = 0; + while (bytes_read < len) { + size_t chunk_size = std::min(len - bytes_read, 64*1024*1024); + DWORD chunk_read = 0; + BOOL result = ReadFile(fp_win32, reinterpret_cast(ptr) + bytes_read, chunk_size, &chunk_read, NULL); + if (!result) { + throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError())); + } + if (chunk_read < chunk_size || chunk_read == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + bytes_read += 
chunk_read; + } + } + + uint32_t read_u32() const { + uint32_t val; + read_raw(&val, sizeof(val)); + return val; + } + + void write_raw(const void * ptr, size_t len) const { + size_t bytes_written = 0; + while (bytes_written < len) { + size_t chunk_size = std::min(len - bytes_written, 64*1024*1024); + DWORD chunk_written = 0; + BOOL result = WriteFile(fp_win32, reinterpret_cast(ptr) + bytes_written, chunk_size, &chunk_written, NULL); + if (!result) { + throw std::runtime_error(std::string{"write error: "} + GetErrorMessageWin32(GetLastError())); + } + if (chunk_written < chunk_size || chunk_written == 0) { + throw std::runtime_error("unexpectedly failed to write bytes"); + } + + bytes_written += chunk_written; + } + } + + void write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~impl() { + if (fp) { + std::fclose(fp); + } + } +#else + impl(const char * fname, const char * mode) { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(std::string{"failed to open "} + fname + ": " + strerror(errno)); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +// TODO: this ifdef is never true? +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + if (ret == -1) { + throw std::runtime_error(std::string{"ftell error: "} + strerror(errno)); + } + + return (size_t) ret; + } + + void seek(size_t offset, int whence) const { +// TODO: this ifdef is never true? 
+#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + if (ret != 0) { + throw std::runtime_error(std::string{"seek error: "} + strerror(errno)); + } + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(std::string{"read error: "} + strerror(errno)); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } + + uint32_t read_u32() const { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(std::string{"write error: "} + strerror(errno)); + } + } + + void write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~impl() { + if (fp) { + std::fclose(fp); + } + } +#endif + + FILE * fp; + size_t size; +}; + +llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file::~llama_file() = default; + +size_t llama_file::tell() const { return pimpl->tell(); } +size_t llama_file::size() const { return pimpl->size; } + +int llama_file::file_id() const { +#ifdef _WIN32 + return _fileno(pimpl->fp); +#else +#if defined(fileno) + return fileno(pimpl->fp); +#else + return ::fileno(pimpl->fp); +#endif +#endif +} + +void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } +void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } + +uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } + +void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); } +void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); } + +// llama_mmap + +struct 
llama_mmap::impl { +#ifdef _POSIX_MAPPED_FILES + std::vector> mapped_fragments; + + impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) : writable{ writable } { + size = file->size(); + int fd = file->file_id(); + int flags = MAP_SHARED; + if (numa) { prefetch = 0; } +#ifdef __linux__ + if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { + GGML_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + strerror(errno)); + } +#endif + addr = mmap(NULL, file->size(), PROT_READ | (writable ? PROT_WRITE : 0), flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(std::string{"mmap failed: "} + strerror(errno)); + } + + prefetch = std::min(file->size(), prefetch == ~0U ? file->size() : prefetch); + // if (madvise(addr, file->size(), MADV_HUGEPAGE)) { // Still does nothing + // GGML_LOG_WARN("warning: madvise(.., MADV_HUGEPAGE) failed: %s\n", + // strerror(errno)); + // } + if (prefetch && !writable) { + // MADV_POPULATE_WRITE is a pessimization +#ifdef __linux__ + if (madvise(addr, prefetch, MADV_POPULATE_READ)) { + GGML_LOG_WARN("warning: madvise(.., MADV_POPULATE_READ) failed: %s\n", + strerror(errno)); + } +#else + if (posix_madvise(addr, prefetch, POSIX_MADV_WILLNEED)) { + GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } +#endif + } + if (numa) { + if (posix_madvise(addr, file->size(), POSIX_MADV_RANDOM)) { + GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } + + mapped_fragments.emplace_back(0, file->size()); + } + + static void align_range(size_t * first, size_t * last, size_t page_size) { + size_t offset_in_page = *first & (page_size - 1); + size_t offset_to_page = offset_in_page == 0 ? 
0 : page_size - offset_in_page; + *first += offset_to_page; + + *last = *last & ~(page_size - 1); + + if (*last <= *first) { + *last = *first; + } + } + + void unmap_fragment(size_t first, size_t last) { + if (writable) { + return; + } + int page_size = sysconf(_SC_PAGESIZE); + align_range(&first, &last, page_size); + size_t len = last - first; + + if (len == 0) { + return; + } + + GGML_ASSERT(first % page_size == 0); + GGML_ASSERT(last % page_size == 0); + GGML_ASSERT(last > first); + + void * next_page_start = (uint8_t *) addr + first; + + if (munmap(next_page_start, len)) { + GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + + std::vector> new_mapped_fragments; + for (const auto & frag : mapped_fragments) { + if (frag.first < first && frag.second > last) { + new_mapped_fragments.emplace_back(frag.first, first); + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first < first && frag.second > first) { + new_mapped_fragments.emplace_back(frag.first, first); + } else if (frag.first < last && frag.second > last) { + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first >= first && frag.second <= last) { + } else { + new_mapped_fragments.push_back(frag); + } + } + mapped_fragments = std::move(new_mapped_fragments); + } + + ~impl() { + for (const auto & frag : mapped_fragments) { + if (writable) { + if (msync((char *) addr + frag.first, frag.second - frag.first, USE_MSYNC ? 
MS_SYNC : MS_ASYNC)) { + GGML_LOG_WARN("warning: msync failed: %s\n", strerror(errno)); + } + } + if (munmap((char *) addr + frag.first, frag.second - frag.first)) { + GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + } + } +#elif defined(_WIN32) + HANDLE hFile; + impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) : writable{ writable } { + GGML_UNUSED(numa); + + size = file->size(); + + hFile = (HANDLE) _get_osfhandle(file->file_id()); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, writable ? PAGE_READWRITE : PAGE_READONLY, 0, 0, NULL); + + if (hMapping == NULL) { + DWORD error = GetLastError(); + throw std::runtime_error(std::string{"CreateFileMappingA failed: "} + llama_format_win_err(error).c_str()); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + DWORD error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw std::runtime_error(std::string{"MapViewOfFile failed: "} + llama_format_win_err(error).c_str()); + } + + if (prefetch > 0) { +#if _WIN32_WINNT >= 0x602 or true + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory"); + + if (pPrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T) std::min(size, prefetch); + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + GGML_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + throw std::runtime_error("PrefetchVirtualMemory unavailable"); +#endif + } + } + + void unmap_fragment(size_t first, size_t last) { + GGML_UNUSED(first); + GGML_UNUSED(last); + } + + ~impl() { + if (writable && USE_MSYNC) { + if (!FlushViewOfFile(addr, 0)) { + GGML_LOG_WARN("warning: 
FlushViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + if (!FlushFileBuffers(hFile)) { + GGML_LOG_WARN("warning: FlushFileBuffers failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } + if (!UnmapViewOfFile(addr)) { + GGML_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) { + GGML_UNUSED(file); + GGML_UNUSED(prefetch); + GGML_UNUSED(numa); + GGML_UNUSED(writable); + + throw std::runtime_error("mmap not supported"); + } + + void unmap_fragment(size_t first, size_t last) { + GGML_UNUSED(first); + GGML_UNUSED(last); + + throw std::runtime_error("mmap not supported"); + } +#endif + + void * addr; + size_t size; + bool writable; +}; + +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa, bool writable) : pimpl(std::make_unique(file, prefetch, numa, writable)) {} +llama_mmap::~llama_mmap() = default; + +size_t llama_mmap::size() const { return pimpl->size; } +void * llama_mmap::addr() const { return pimpl->addr; } + +void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); } + +#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32) +const bool llama_mmap::SUPPORTED = true; +#else +const bool llama_mmap::SUPPORTED = false; +#endif + +// llama_mlock + +struct llama_mlock::impl { +#ifdef _POSIX_MEMLOCK_RANGE + static size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + bool raw_lock(const void * addr, size_t size) const { + if (!mlock(addr, size)) { + return true; + } + +#ifdef __APPLE__ +#define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n" +#else +#define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n" +#endif + + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); +#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) + // visionOS/tvOS dont't support RLIMIT_MEMLOCK + // Skip resource limit checks on visionOS/tvOS + suggest = false; +#else + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + suggest = false; + } + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + suggest = false; + } +#endif + + GGML_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + return false; + } + + static void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + GGML_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * ptr, size_t len) const { + for (int tries = 1; ; tries++) { + if (VirtualLock(ptr, len)) { + return true; + } + if (tries == 2) { + GGML_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + len, size, llama_format_win_err(GetLastError()).c_str()); + return false; + } + + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + GGML_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + return false; + } + size_t increment = len + 1048576; + min_ws_size += increment; + max_ws_size += increment; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + GGML_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n", + 
llama_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + static void raw_unlock(void * ptr, size_t len) { + if (!VirtualUnlock(ptr, len)) { + GGML_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } + } +#else + static size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const void * addr, size_t len) const { + GGML_LOG_WARN("warning: mlock not supported on this system\n"); + return false; + } + + static void raw_unlock(const void * addr, size_t len) {} +#endif + + impl() : addr(NULL), size(0), failed_already(false) {} + + void init(void * ptr) { + GGML_ASSERT(addr == NULL && size == 0); + addr = ptr; + } + + void grow_to(size_t target_size) { + GGML_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + + void * addr; + size_t size; + + bool failed_already; +}; + +llama_mlock::llama_mlock() : pimpl(std::make_unique()) {} +llama_mlock::~llama_mlock() = default; + +void llama_mlock::init(void * ptr) { pimpl->init(ptr); } +void llama_mlock::grow_to(size_t target_size) { pimpl->grow_to(target_size); } + +#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32) +const bool llama_mlock::SUPPORTED = true; +#else +const bool llama_mlock::SUPPORTED = false; +#endif + +size_t llama_path_max() { + return PATH_MAX; +} diff --git a/ggml-patches/llama-mmap.h b/ggml-patches/llama-mmap.h new file mode 100644 index 0000000..419579f --- /dev/null +++ b/ggml-patches/llama-mmap.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include + +struct llama_file; +struct llama_mmap; +struct llama_mlock; + +using llama_files = std::vector>; +using llama_mmaps = std::vector>; +using llama_mlocks = std::vector>; + +struct 
llama_file { + llama_file(const char * fname, const char * mode); + ~llama_file(); + + size_t tell() const; + size_t size() const; + + int file_id() const; // fileno overload + + void seek(size_t offset, int whence) const; + + void read_raw(void * ptr, size_t len) const; + uint32_t read_u32() const; + + void write_raw(const void * ptr, size_t len) const; + void write_u32(uint32_t val) const; + +private: + struct impl; + std::unique_ptr pimpl; +}; + +struct llama_mmap { + llama_mmap(const llama_mmap &) = delete; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool writable = false); + ~llama_mmap(); + + size_t size() const; + void * addr() const; + + void unmap_fragment(size_t first, size_t last); + + static const bool SUPPORTED; + +private: + struct impl; + std::unique_ptr pimpl; +}; + +struct llama_mlock { + llama_mlock(); + ~llama_mlock(); + + void init(void * ptr); + void grow_to(size_t target_size); + + static const bool SUPPORTED; + +private: + struct impl; + std::unique_ptr pimpl; +}; + +size_t llama_path_max(); diff --git a/include/common.h b/include/common.h index c3a1a1c..981b34c 100644 --- a/include/common.h +++ b/include/common.h @@ -1,11 +1,13 @@ -#ifndef common_h -#define common_h +#pragma once #include #include #include +#include #include +using namespace std; + // Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer // pointer directly rather than copying the contents of the buffer to a predefined std::vector. 
struct tts_response { @@ -28,7 +30,7 @@ const std::map SUPPORTED_ARCHITECTURES = { { "orpheus", ORPHEUS_ARCH } }; -/// Given a map from keys to values, creates a new map from values to keys +/// Given a map from keys to values, creates a new map from values to keys template static std::map reverse_map(const std::map& m) { std::map r; @@ -43,10 +45,10 @@ const std::map ARCHITECTURE_NAMES = reverse_map(SUPPORTED struct generation_configuration { generation_configuration( std::string voice = "", - int top_k = 50, - float temperature = 1.0, - float repetition_penalty = 1.0, - bool use_cross_attn = true, + int top_k = 50, + float temperature = 1.0, + float repetition_penalty = 1.0, + bool use_cross_attn = true, std::string espeak_voice_id = "", int max_tokens = 0, float top_p = 1.0, @@ -64,17 +66,29 @@ struct generation_configuration { }; struct tts_runner { - tts_arch arch; struct ggml_context * ctx = nullptr; float sampling_rate = 44100.0f; bool supports_voices = false; - std::string arch_name() { - return ARCHITECTURE_NAMES.at(arch); - } + virtual ~tts_runner() = default; void init_build(std::vector* buf_compute_meta); void free_build(); }; -#endif +struct ggml_tensor; +struct tts_model_loader; +struct llama_mmap; + +struct tts_generation_runner : tts_runner { + const reference_wrapper loader; + unique_ptr buf; + explicit tts_generation_runner(const tts_model_loader & loader); + ~tts_generation_runner() override; + + virtual void assign_weight(const char * name, ggml_tensor & tensor) = 0; + virtual void prepare_post_load() = 0; + virtual vector list_voices(); + virtual void update_conditional_prompt(const char * file_path, const char * prompt); + virtual void generate(const char * sentence, tts_response & output, const generation_configuration & config) = 0; +}; diff --git a/include/tts.h b/include/tts.h deleted file mode 100644 index 30e98dc..0000000 --- a/include/tts.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef tts_h -#define tts_h - -#include "parler_model.h" 
-#include "kokoro_model.h" -#include "dia_model.h" -#include "orpheus_model.h" -#include -#include -#include - -struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); -struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); -struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); -struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); -struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true); -int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config); -void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only = true); -std::vector list_voices(tts_runner * runner); - -struct quantization_params { - quantization_params(uint32_t n_threads, enum ggml_type quantize_type): n_threads(n_threads), quantize_type(quantize_type) {}; - uint32_t n_threads; - enum ggml_type quantize_type; // quantization type - bool quantize_output_heads = false; - bool quantize_text_embeddings = false; - bool quantize_cross_attn_kv = false; - bool convert_dac_to_f16 = false; - bool convert_non_quantizable_to_f16 = false; -}; - -void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params); - -#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d07940..77a1e2a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,45 +2,38 @@ if (WIN32) if (BUILD_SHARED_LIBS) 
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - endif() -endif() + endif () +endif () # TTS add_library(tts - ../include/tts.h - ../include/args.h - ../include/phonemizer.h - tts.cpp - tokenizer.cpp - sampler.cpp - parler_model.cpp - dac_model.cpp - util.cpp - args.cpp - t5_encoder_model.cpp - phonemizer.cpp - tts_model.cpp - kokoro_model.cpp - dia_model.cpp - orpheus_model.cpp - snac_model.cpp - general_neural_audio_codec.cpp - ) + ../include/args.h + tokenizer.cpp + sampler.cpp + sampler.h + util.cpp + util.h + args.cpp + tts_model.cpp + tts_model.h +) target_include_directories(tts PUBLIC . ../include ../ggml/src/) -target_compile_features (tts PUBLIC cxx_std_11) # don't bump - -if (ESPEAK_INCLUDE_DIRS) - set_source_files_properties(phonemizer.cpp PROPERTIES COMPILE_FLAGS "${ESPEAK_CFLAGS_OTHER}") - set_source_files_properties(phonemizer.cpp PROPERTIES INCLUDE_DIRECTORIES "${ESPEAK_INCLUDE_DIRS}") - target_link_libraries(tts PUBLIC ${ESPEAK_LIBRARIES}) -endif() target_link_libraries(tts PUBLIC ggml) if (BUILD_SHARED_LIBS) set_target_properties(tts PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(tts PRIVATE LLAMA_BUILD) - target_compile_definitions(tts PUBLIC LLAMA_SHARED) -endif() + target_compile_definitions(tts PUBLIC LLAMA_SHARED) +endif () + +add_subdirectory(decoder) +add_subdirectory(models) + +if (ESPEAK_INCLUDE_DIRS) + set_source_files_properties(models/kokoro/phonemizer.cpp PROPERTIES COMPILE_FLAGS "${ESPEAK_CFLAGS_OTHER}") + set_source_files_properties(models/kokoro/phonemizer.cpp PROPERTIES INCLUDE_DIRECTORIES "${ESPEAK_INCLUDE_DIRS}") + target_link_libraries(tts PUBLIC ${ESPEAK_LIBRARIES}) +endif () diff --git a/src/decoder/CMakeLists.txt b/src/decoder/CMakeLists.txt new file mode 100644 index 0000000..ef05806 --- /dev/null +++ b/src/decoder/CMakeLists.txt @@ -0,0 +1,8 @@ +target_sources(tts PRIVATE + dac_model.cpp + dac_model.h + general_neural_audio_codec.cpp + general_neural_audio_codec.h + snac_model.cpp + snac_model.h +) diff 
--git a/src/dac_model.cpp b/src/decoder/dac_model.cpp similarity index 100% rename from src/dac_model.cpp rename to src/decoder/dac_model.cpp index 6685007..defce7e 100644 --- a/src/dac_model.cpp +++ b/src/decoder/dac_model.cpp @@ -1,4 +1,5 @@ #include "dac_model.h" + #include #include @@ -209,4 +210,3 @@ void dac_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct t outputs->n_outputs = sequence_length * model->up_sampling_factor; return; } - diff --git a/src/dac_model.h b/src/decoder/dac_model.h similarity index 99% rename from src/dac_model.h rename to src/decoder/dac_model.h index be43ad0..c74a095 100644 --- a/src/dac_model.h +++ b/src/decoder/dac_model.h @@ -1,9 +1,10 @@ #ifndef dac_model_h #define dac_model_h -#include "general_neural_audio_codec.h" #include +#include "general_neural_audio_codec.h" + enum dac_tensor { DAC_ENCODER_IN_KERNEL, DAC_ENCODER_IN_BIAS, diff --git a/src/general_neural_audio_codec.cpp b/src/decoder/general_neural_audio_codec.cpp similarity index 99% rename from src/general_neural_audio_codec.cpp rename to src/decoder/general_neural_audio_codec.cpp index 8f7893e..371afef 100644 --- a/src/general_neural_audio_codec.cpp +++ b/src/decoder/general_neural_audio_codec.cpp @@ -1,7 +1,8 @@ #include "general_neural_audio_codec.h" + #include -#include #include +#include namespace general_neural_audio_codec { // This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file diff --git a/src/general_neural_audio_codec.h b/src/decoder/general_neural_audio_codec.h similarity index 99% rename from src/general_neural_audio_codec.h rename to src/decoder/general_neural_audio_codec.h index 1ec0a42..7ea7977 100644 --- a/src/general_neural_audio_codec.h +++ b/src/decoder/general_neural_audio_codec.h @@ -1,6 +1,6 @@ #pragma once -#include "tts_model.h" +#include "../tts_model.h" // This namespace implements a general abstraction of the core functionality used in common 
neural audio codecs like DAC and SNAC. namespace general_neural_audio_codec { diff --git a/src/snac_model.cpp b/src/decoder/snac_model.cpp similarity index 100% rename from src/snac_model.cpp rename to src/decoder/snac_model.cpp diff --git a/src/snac_model.h b/src/decoder/snac_model.h similarity index 100% rename from src/snac_model.h rename to src/decoder/snac_model.h diff --git a/src/models/CMakeLists.txt b/src/models/CMakeLists.txt new file mode 100644 index 0000000..861ac5e --- /dev/null +++ b/src/models/CMakeLists.txt @@ -0,0 +1,9 @@ +target_sources(tts PRIVATE + loaders.cpp + loaders.h +) + +add_subdirectory(dia) +add_subdirectory(kokoro) +add_subdirectory(orpheus) +add_subdirectory(parler) diff --git a/src/models/dia/CMakeLists.txt b/src/models/dia/CMakeLists.txt new file mode 100644 index 0000000..2e86004 --- /dev/null +++ b/src/models/dia/CMakeLists.txt @@ -0,0 +1,5 @@ +target_sources(tts PRIVATE + loader.cpp + model.cpp + model.h +) diff --git a/src/models/dia/loader.cpp b/src/models/dia/loader.cpp new file mode 100644 index 0000000..7bad51e --- /dev/null +++ b/src/models/dia/loader.cpp @@ -0,0 +1,23 @@ +#include "../loaders.h" +#include "model.h" + +void dia_register() {} + +dia_model_loader::dia_model_loader() : tts_model_loader{ "dia" } {} + +unique_ptr dia_model_loader::from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, + int n_threads, bool cpu_only, + const generation_configuration & config) const { + dia_model * model = new dia_model; + dac_model * audio_model = new dac_model; + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + sampler * samp = new sampler; + dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only); + dac_runner * audio_decoder = new dac_runner(audio_model, dctx); + dia_context * diactx = build_new_dia_context(model, n_threads, cpu_only); + dia_kv_cache * cache = new dia_kv_cache; + return make_unique(model, audio_decoder, diactx, 
samp, cache); +} + +const dia_model_loader dia_loader{}; diff --git a/src/dia_model.cpp b/src/models/dia/model.cpp similarity index 97% rename from src/dia_model.cpp rename to src/models/dia/model.cpp index bd6dfd4..72d4b38 100644 --- a/src/dia_model.cpp +++ b/src/models/dia/model.cpp @@ -1,4 +1,4 @@ -#include "dia_model.h" +#include "model.h" void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) { std::vector parts = split(name, "."); @@ -720,16 +720,6 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) { return gf; } -void dia_runner::configure_generation(generation_configuration * config) { - GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay); - decode_sampler->temperature = config->temperature; - decode_sampler->repetition_penalty = config->repetition_penalty; - decode_sampler->do_sample = config->sample; - decode_sampler->top_k = config->top_k; - decode_sampler->top_p = config->top_p; - dctx->max_generation_size = config->max_tokens > model->max_delay ? 
config->max_tokens : model->max_generation_size; -} - void dia_runner::set_inputs(dia_ubatch & batch) { if (batch.encoder_step) { ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens)); @@ -856,7 +846,7 @@ void dia_runner::adjust_output_tokens(std::vector & output_tokens, std } } -int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) { +int dia_runner::generate_from_batch(dia_ubatch & batch, tts_response & output) { while (!check_stopping(batch)) { int state = decode(batch); if (state != 0) { @@ -875,11 +865,19 @@ int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * ou std::vector filtered_output_tokens; adjust_output_tokens(dctx->output_tokens, filtered_output_tokens); - dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output); + dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, &output); return 0; } -int dia_runner::generate(std::string sentence, struct tts_response * output) { +void dia_runner::generate(const char * sentence, tts_response & output, const generation_configuration & config) { + GGML_ASSERT(config.max_tokens == 0 || config.max_tokens > model->max_delay); + decode_sampler->temperature = config.temperature; + decode_sampler->repetition_penalty = config.repetition_penalty; + decode_sampler->do_sample = config.sample; + decode_sampler->top_k = config.top_k; + decode_sampler->top_p = config.top_p; + dctx->max_generation_size = config.max_tokens > model->max_delay ? 
config.max_tokens : model->max_generation_size; + dia_ubatch batch = batch_from_sentence(sentence); dctx->reset(); decode_sampler->reset(); @@ -887,25 +885,16 @@ int dia_runner::generate(std::string sentence, struct tts_response * output) { if (!kv_cross_self) { kv_cross_self = new dia_kv_cache; if (!dia_kv_cache_init(kv_cross_self, model, dctx)) { - return 1; + return; } } - return generate_from_batch(batch, output); + generate_from_batch(batch, output); } -void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) { - if (tensor->data == NULL) { - return; - } - - if (name.size() == 0) { - // handles the top level meta tensor - return; - } - - if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") { - dac_runner->model->assign_weight(name.substr(14), tensor); +void dia_runner::assign_weight(const char * name, ggml_tensor & tensor) { + if (const string_view name_sv{ name }; name_sv.starts_with("audio_encoder.")) { + dac_runner->model->assign_weight(string{ name_sv.substr(sizeof("audio_encoder.") - 1) }, &tensor); } else { - model->assign_weight(name, tensor); - } + model->assign_weight(name, &tensor); + } } diff --git a/src/dia_model.h b/src/models/dia/model.h similarity index 89% rename from src/dia_model.h rename to src/models/dia/model.h index bdca91d..8a36f9c 100644 --- a/src/dia_model.h +++ b/src/models/dia/model.h @@ -1,7 +1,17 @@ #pragma once -#include "dac_model.h" -#include "sampler.h" +#include "../../decoder/dac_model.h" +#include "../../sampler.h" +#include "models/loaders.h" + +extern const struct dia_model_loader final : tts_model_loader { + explicit dia_model_loader(); + + unique_ptr from_file(gguf_context * meta_ctx, + ggml_context * weight_ctx, int n_threads, bool cpu_only, + const generation_configuration & config) const override; +} dia_loader; + struct dia_encoder_layer { struct ggml_tensor * k; @@ -165,8 +175,9 @@ static struct ggml_tensor * build_dia_decoder( ggml_cgraph * gf, ggml_context * // This struct is intended to 
support end-to-end TTS generation for the Dia model. As such, it manages Dia's model compilation, compute, generation, // tokenizationm and sampling process, and uses the dac_runner struct to encode audio outputs. -struct dia_runner : tts_runner { - dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache): model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) { +struct dia_runner : tts_generation_runner { + dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache): + tts_generation_runner{dia_loader}, model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) { decode_sampler->vocab_size = model->output_vocab_size; }; ~dia_runner() { @@ -192,15 +203,14 @@ struct dia_runner : tts_runner { void tokenize_sentence(std::string sentence, dia_ubatch & tokens); dia_ubatch batch_from_sentence(std::string sentence); - void configure_generation(generation_configuration * config); - void assign_weight(std::string name, ggml_tensor * tensor); + void assign_weight(const char * name, ggml_tensor & tensor) override; dia_ubatch build_worst_case_batch(); struct ggml_cgraph * build_dia_graph(dia_ubatch & batch); void set_inputs(dia_ubatch & batch); int decode(dia_ubatch & batch); - void prepare_post_load(); - int generate(std::string sentence, struct tts_response * response); + void prepare_post_load() override; + void generate(const char * sentence, tts_response & response, const generation_configuration & config) override; bool check_stopping(dia_ubatch & batch); void adjust_output_tokens(std::vector & output_tokens, std::vector & filtered); - int generate_from_batch(dia_ubatch & batch, struct tts_response * output); + int generate_from_batch(dia_ubatch & batch, tts_response & output); }; diff --git a/src/models/kokoro/CMakeLists.txt b/src/models/kokoro/CMakeLists.txt new file mode 100644 
index 0000000..db438ca --- /dev/null +++ b/src/models/kokoro/CMakeLists.txt @@ -0,0 +1,7 @@ +target_sources(tts PRIVATE + loader.cpp + model.cpp + model.h + phonemizer.cpp + phonemizer.h +) diff --git a/src/models/kokoro/loader.cpp b/src/models/kokoro/loader.cpp new file mode 100644 index 0000000..3437cf1 --- /dev/null +++ b/src/models/kokoro/loader.cpp @@ -0,0 +1,27 @@ +#include "../loaders.h" +#include "model.h" + +void kokoro_register() {} + +kokoro_model_loader::kokoro_model_loader() : tts_model_loader{ "kokoro" } {} + +unique_ptr kokoro_model_loader::from_file( + gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, bool cpu_only, + const generation_configuration & config) const { + unique_ptr model = make_unique(); + single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens"); + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + kokoro_duration_context * kdctx = build_new_duration_kokoro_context(&*model, n_threads, cpu_only); + auto * duration_runner = new kokoro_duration_runner(&*model, kdctx, spt); + kokoro_context * kctx = build_new_kokoro_context(&*model, n_threads, cpu_only); + // if an espeak voice id wasn't specifically set infer it from the kokoro voice, + // if it was override it, otherwise fallback to American English. 
+ const char * espeak_voice_id{ config.espeak_voice_id.c_str() }; + if (!*espeak_voice_id) { + espeak_voice_id = get_espeak_id_from_kokoro_voice(config.voice); + } + phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id); + return make_unique(move(model), kctx, spt, duration_runner, phmzr); +} + +const kokoro_model_loader kokoro_loader{}; diff --git a/src/kokoro_model.cpp b/src/models/kokoro/model.cpp similarity index 95% rename from src/kokoro_model.cpp rename to src/models/kokoro/model.cpp index 70f1972..72a0d36 100644 --- a/src/kokoro_model.cpp +++ b/src/models/kokoro/model.cpp @@ -1,4 +1,4 @@ -#include "kokoro_model.h" +#include "model.h" static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context *kctx, const kokoro_ubatch & batch) { kctx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) batch.n_tokens, (int64_t) batch.n_tokens); @@ -62,16 +62,16 @@ static struct ggml_tensor * build_lstm_run(ggml_context * ctx, ggml_cgraph * gf, int i = reversed ? 
sequence_length - 1 - index : index; struct ggml_tensor * I_cur = ggml_view_3d(ctx, I, I->ne[0], 1, I->ne[2], I->nb[0], I->nb[1], I->nb[1]*i); I_cur = ggml_sigmoid(ctx, ggml_add(ctx, I_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[1], h_0), biases[1]))); - + struct ggml_tensor * F_cur = ggml_view_3d(ctx, F, F->ne[0], 1, F->ne[2], F->nb[0], F->nb[1], F->nb[1]*i); F_cur = ggml_sigmoid(ctx, ggml_add(ctx, F_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[3], h_0), biases[3]))); - + struct ggml_tensor * G_cur = ggml_view_3d(ctx, G, G->ne[0], 1, G->ne[2], G->nb[0], G->nb[1], G->nb[1]*i); G_cur = ggml_tanh(ctx, ggml_add(ctx, G_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[5], h_0), biases[5]))); - + struct ggml_tensor * O_cur = ggml_view_3d(ctx, O, O->ne[0], 1, O->ne[2], O->nb[0], O->nb[1], O->nb[1]*i); O_cur = ggml_sigmoid(ctx, ggml_add(ctx, O_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[7], h_0), biases[7]))); - + c_0 = ggml_add(ctx, ggml_mul(ctx, F_cur, c_0), ggml_mul(ctx, I_cur, G_cur)); h_0 = ggml_mul(ctx, ggml_tanh(ctx, c_0), O_cur); @@ -210,7 +210,7 @@ static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * m cur = ggml_leaky_relu(ctx, cur, 0.1f, false); cur = ggml_add(ctx, ggml_conv_transpose_1d(ctx, generator->ups[i]->upsample_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), generator->ups[i]->stride, generator->ups[i]->padding, 1, 0, 1), generator->ups[i]->upsample_bias); if (i == generator->ups.size() - 1) { - // This is a hacky way of implementing the simple reflection padding used here. + // This is a hacky way of implementing the simple reflection padding used here. // In general, ggml should eventually be built to support expressive reflective padding but for such simple front padding this makes more sense. 
struct ggml_tensor * temp = ggml_cont(ctx, ggml_view_3d(ctx, cur, 1, cur->ne[1], cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[0])); cur = ggml_concat(ctx, temp, cur, 0); @@ -232,8 +232,8 @@ static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * m cur = ggml_leaky_relu(ctx, cur, 0.01f, false); cur = ggml_add(ctx, ggml_conv_1d(ctx, generator->out_conv_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, model->out_conv_padding, 1), generator->out_conv_bias); - struct ggml_tensor * spec = ggml_view_3d(ctx, cur, cur->ne[0], model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], 0); - struct ggml_tensor * phase = ggml_view_3d(ctx, cur, cur->ne[0], cur->ne[1] - model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[1] * model->post_n_fft); + struct ggml_tensor * spec = ggml_view_3d(ctx, cur, cur->ne[0], model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], 0); + struct ggml_tensor * phase = ggml_view_3d(ctx, cur, cur->ne[0], cur->ne[1] - model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[1] * model->post_n_fft); phase = ggml_sin(ctx, phase); spec = ggml_exp(ctx, spec); @@ -385,7 +385,7 @@ void kokoro_model::post_load_assign() { sampling_factor_scalar->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset); size_t scsize = ggml_nbytes(sampling_factor_scalar); // while it might appear that the upsampling_rate could be used here, the interpolation rate (i.e. the upsampling scale) is actually independent in the kokoro model implementation. 
- float sample_scalar = upsample_scale*2.0f*M_PI; + float sample_scalar = upsample_scale*2.0f*M_PI; ggml_backend_tensor_set(sampling_factor_scalar, &sample_scalar, 0, scsize); offset += scsize; post_load_tensor_bytes = 300 + offset - original_offset; @@ -410,24 +410,20 @@ void kokoro_model::assign_lstm(lstm * rnn, std::string name, ggml_tensor * tenso } } -void kokoro_model::assign_weight(std::string name, ggml_tensor * tensor) { - // all kokoro tensors are prepended by "kokoro" so lets trim that off and assign based on the module - std::vector parts = split(name, "."); - if (parts.size() < 2) { - return; // handle the null context tensor; - } - if (parts[1] == "albert") { - assign_albert_weight(name.substr(7+parts[1].size()+1), tensor); - } else if (parts[1] == "duration_predictor") { - assign_duration_weight(name.substr(7+parts[1].size()+1), tensor); - } else if (parts[1] == "text_encoder") { - assign_text_encoder_weight(name.substr(7+parts[1].size()+1), tensor); - } else if (parts[1] == "decoder") { - assign_decoder_weight(name.substr(7+parts[1].size()+1), tensor); - } else if (parts[1] == "voice_tensors") { - voices[parts[2]] = ggml_dup_tensor(ctx, tensor); - set_tensor(voices[parts[2]], tensor); - } +void kokoro_model::assign_weight(const char * name, ggml_tensor & tensor) { + if (const string_view name_sv{ name }; name_sv.starts_with("albert.")) { + assign_albert_weight(string{ name_sv.substr(sizeof("albert.") - 1) }, &tensor); + } else if (name_sv.starts_with("duration_predictor.")) { + assign_duration_weight(string{ name_sv.substr(sizeof("duration_predictor.") - 1) }, &tensor); + } else if (name_sv.starts_with("text_encoder.")) { + assign_text_encoder_weight(string{ name_sv.substr(sizeof("text_encoder.") - 1) }, &tensor); + } else if (name_sv.starts_with("decoder.")) { + assign_decoder_weight(string{ name_sv.substr(sizeof("decoder.") - 1) }, &tensor); + } else if (name_sv.starts_with("voice_tensors.")) { + const string voice{ 
name_sv.substr(sizeof("voice_tensors.") - 1) }; + voices[voice] = ggml_dup_tensor(ctx, &tensor); + set_tensor(voices[voice], &tensor); + } } void kokoro_model::assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor) { @@ -484,7 +480,7 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block, set_tensor(block->adain1d_1_gamma_biases[i], tensor); } else if (parts[1] == "gamma2_bias") { block->adain1d_2_gamma_biases[i] = ggml_dup_tensor(ctx, tensor); - set_tensor(block->adain1d_2_gamma_biases[i], tensor); + set_tensor(block->adain1d_2_gamma_biases[i], tensor); } else if (parts[1] == "beta1_weight") { block->adain1d_1_beta_weights[i] = ggml_dup_tensor(ctx, tensor); set_tensor(block->adain1d_1_beta_weights[i], tensor); @@ -496,7 +492,7 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block, set_tensor(block->adain1d_1_beta_biases[i], tensor); } else if (parts[1] == "beta2_bias") { block->adain1d_2_beta_biases[i] = ggml_dup_tensor(ctx, tensor); - set_tensor(block->adain1d_2_beta_biases[i], tensor); + set_tensor(block->adain1d_2_beta_biases[i], tensor); } else if (parts[1] == "convs1_weight") { block->convs1_weights[i] = ggml_dup_tensor(ctx, tensor); set_tensor(block->convs1_weights[i], tensor); @@ -508,13 +504,13 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block, set_tensor(block->convs1_biases[i], tensor); } else if (parts[1] == "convs2_bias") { block->convs2_biases[i] = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); - set_tensor(block->convs2_biases[i], tensor); + set_tensor(block->convs2_biases[i], tensor); } else if (parts[1] == "alpha1") { block->input_alphas[i] = ggml_dup_tensor(ctx, tensor); set_tensor(block->input_alphas[i], tensor); } else if (parts[1] == "alpha2") { block->output_alphas[i] = ggml_dup_tensor(ctx, tensor); - set_tensor(block->output_alphas[i], tensor); + set_tensor(block->output_alphas[i], tensor); } } @@ -540,7 +536,7 @@ 
void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st set_tensor(block->norm1_gamma_bias, tensor); } else if (name == "norm2_gamma_bias") { block->norm2_gamma_bias = ggml_dup_tensor(ctx, tensor); - set_tensor(block->norm2_gamma_bias, tensor); + set_tensor(block->norm2_gamma_bias, tensor); } else if (name == "norm1_beta_weight") { block->norm1_beta = ggml_dup_tensor(ctx, tensor); set_tensor(block->norm1_beta, tensor); @@ -552,7 +548,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st set_tensor(block->norm1_beta_bias, tensor); } else if (name == "norm2_beta_bias") { block->norm2_beta_bias = ggml_dup_tensor(ctx, tensor); - set_tensor(block->norm2_beta_bias, tensor); + set_tensor(block->norm2_beta_bias, tensor); } else if (name == "conv1_weight") { block->conv1 = ggml_dup_tensor(ctx, tensor); set_tensor(block->conv1, tensor); @@ -564,20 +560,20 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st set_tensor(block->conv1_bias, tensor); } else if (name == "conv2_bias") { block->conv2_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); - set_tensor(block->conv2_bias, tensor); + set_tensor(block->conv2_bias, tensor); } else if (name == "pool_weight") { block->pool = ggml_dup_tensor(ctx, tensor); set_tensor(block->pool, tensor); } else if (name == "pool_bias") { block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); - set_tensor(block->pool_bias, tensor); + set_tensor(block->pool_bias, tensor); } else if (name == "conv1x1_weight") { tensor = squeeze_3d_2d_e0(ctx, tensor); block->upsample = ggml_dup_tensor(ctx, tensor); set_tensor(block->upsample, tensor); } else if (name == "conv1x1_bias") { block->upsample_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); - set_tensor(block->upsample_bias, tensor); + set_tensor(block->upsample_bias, tensor); } } @@ -853,7 +849,7 @@ void kokoro_model::prep_constants(gguf_context * meta) { if (vocab_size_key != -1) { 
vocab_size = gguf_get_val_u32(meta, vocab_size_key); } - + int hidden_size_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.hidden_size"); if (hidden_size_key != -1) { hidden_size = gguf_get_val_u32(meta, hidden_size_key); @@ -967,7 +963,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_ cur = inpL; struct ggml_tensor * KQ_mask_dec = build_albert_attn_mask(ctx, kctx, batch); - + for (int r = 0; r < model->n_recurrence; r++) { for (int l = 0; l < model->n_layers; l++) { struct ggml_tensor * residual = cur ; @@ -1046,7 +1042,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_ ggml_build_forward_expand(gf, len); free_build(); - + return gf; } @@ -1087,7 +1083,7 @@ void kokoro_duration_runner::run(kokoro_ubatch & batch) { prev_size = kctx->buf_len_output ? ggml_backend_buffer_get_size(kctx->buf_len_output) : 0; new_size = model->max_context_length * sizeof(float); - + if (!kctx->buf_len_output || prev_size < new_size) { if (kctx->buf_output) { ggml_backend_buffer_free(kctx->buf_len_output); @@ -1097,22 +1093,22 @@ void kokoro_duration_runner::run(kokoro_ubatch & batch) { kctx->buf_len_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size); } - - + + batch.resp->hidden_states = (float *) ggml_backend_buffer_get_base(kctx->buf_output); ggml_backend_buffer_clear(kctx->buf_output, 0); batch.resp->lengths = (float *) ggml_backend_buffer_get_base(kctx->buf_len_output); ggml_backend_buffer_clear(kctx->buf_len_output, 0); - + struct ggml_cgraph * gf = NULL; gf = build_kokoro_duration_graph(batch); - + // the output is always the last tensor in the graph struct ggml_tensor * lens = gf->nodes[gf->n_nodes - 1]; // the reused duration hidden states are computed before a node chunk which has a size that is sequence length dependent struct ggml_tensor * hidden_states = gf->nodes[gf->n_nodes - 22 - 52 * batch.n_tokens]; ggml_backend_sched_alloc_graph(kctx->sched, gf); - + 
set_inputs(batch); ggml_backend_sched_graph_compute_async(kctx->sched, gf); @@ -1192,7 +1188,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) { n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias); ggml_set_name(n, "n_out"); ggml_build_forward_expand(gf, n); - + // kokoro text encoding; struct ggml_tensor * asr; //struct ggml_tensor * embd; @@ -1210,7 +1206,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) { asr = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), ggml_cont(ctx, ggml_transpose(ctx, kctx->duration_mask))); } - // decoding and generation prep + // decoding and generation prep struct ggml_tensor * asr_res; struct ggml_tensor * f0; struct ggml_tensor * n_base; @@ -1239,7 +1235,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) { ggml_set_input(kctx->window_sq_sum); // run generation - cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf); + cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf); ggml_build_forward_expand(gf, cur); free_build(); return gf; @@ -1277,7 +1273,7 @@ void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) { } } -void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) { +void kokoro_runner::run(kokoro_ubatch & batch, tts_response & outputs) { batch.resp = new kokoro_duration_response; drunner->run(batch); @@ -1299,7 +1295,7 @@ void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) { kctx->buf_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size); } - outputs->data = (float *) ggml_backend_buffer_get_base(kctx->buf_output); + outputs.data = (float *) ggml_backend_buffer_get_base(kctx->buf_output); ggml_backend_buffer_clear(kctx->buf_output, 0); 
kctx->sequence_length = batch.n_tokens; @@ -1307,34 +1303,37 @@ void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) { struct ggml_cgraph * gf = NULL; gf = build_kokoro_graph(batch); - + // the output is always the last tensor in the graph struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1]; ggml_backend_sched_alloc_graph(kctx->sched, gf); - + set_inputs(batch, total_length); ggml_backend_sched_graph_compute_async(kctx->sched, gf); - kctx->get_ggml_node_data(output, outputs->data, new_size); + kctx->get_ggml_node_data(output, outputs.data, new_size); // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. ggml_backend_sched_reset(kctx->sched); - outputs->n_outputs = total_length*model->up_sampling_factor; + outputs.n_outputs = total_length*model->up_sampling_factor; free(batch.resp); return; } -void kokoro_runner::assign_weight(std::string name, ggml_tensor * tensor) { - model->assign_weight(name, tensor); +void kokoro_runner::assign_weight(const char * name, ggml_tensor & tensor) { + const string_view name_sv{ name }; + GGML_ASSERT(name_sv.starts_with("kokoro.")); + const string trimmed{ name_sv.substr(sizeof("kokoro.") - 1) }; + model->assign_weight(trimmed.c_str(), tensor); } /* * #tokenize_chunks is used to split up a larger than max context size (512) token prompt into discrete * blocks for generation. This solution, in accordance with Kokoro's pyTorch implementation, splits - * the prompt by sentence when possible (this can result in slower inference but generally produces cleaner + * the prompt by sentence when possible (this can result in slower inference but generally produces cleaner * speech). If a disinct sentence is too long, then it splits at the nearest space. 
*/ std::vector> kokoro_runner::tokenize_chunks(std::vector clauses) { @@ -1343,7 +1342,7 @@ std::vector> kokoro_runner::tokenize_chunks(std::vector tokens; tokens.push_back(model->bos_token_id); tokenizer->tokenize(clause, tokens); @@ -1387,33 +1386,35 @@ std::vector> kokoro_runner::tokenize_chunks(std::vectorvoices.find(voice) == model->voices.end()) { - TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str()); +void kokoro_runner::generate(const char * prompt, tts_response & response, const generation_configuration & config) { + if (model->voices.find(config.voice) == model->voices.end()) { + TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", config.voice.c_str()); } else { // if the language changed then we should change the phonemization voice - if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) { + if (phmzr->mode == ESPEAK && kctx->voice[0] != config.voice[0]) { + std::string voice_code{config.espeak_voice_id}; if (voice_code.empty()) { - voice_code = get_espeak_id_from_kokoro_voice(voice); + voice_code = get_espeak_id_from_kokoro_voice(config.voice); } update_voice(voice_code); } - kctx->voice = voice; - drunner->kctx->voice = voice; + kctx->voice = config.voice; + drunner->kctx->voice = config.voice; } // replace all non-sentence terminating characters with '--' which espeak will treat as a pause. 
// We preserve the other punctuation for cleaner chunking pre-tokenization - prompt = replace_any(prompt, ",;:", "--"); - prompt = replace_any(prompt, "\n", " "); - std::string phonemized_prompt = phmzr->text_to_phonemes(prompt); + std::string normalized{prompt}; + normalized = replace_any(normalized, ",;:", "--"); + normalized = replace_any(normalized, "\n", " "); + std::string phonemized_prompt = phmzr->text_to_phonemes(normalized); - // Kokoro users a utf-8 single character tokenizer so if the size of the prompt is smaller than the max context length without the + // Kokoro uses a utf-8 single character tokenizer so if the size of the prompt is smaller than the max context length without the // beginning of sentence and end of sentence tokens then we can compute it all at once. - if (phonemized_prompt.size() < model->max_context_length - 2) { + if (phonemized_prompt.size() < model->max_context_length - 2) { // we preserved punctuation and Kokoro interprets these tokens as end of sentence tokens, so we have to remove them for all-at-once compute. phonemized_prompt = strip(replace_any(phonemized_prompt, ".!?", "")); if (phonemized_prompt.empty()) { - return 0; + return; } std::vector tokens; tokens.push_back(model->bos_token_id); @@ -1425,32 +1426,26 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response, run(batch, response); } else { // TODO: determine the performance to memory trade off in using a batched compute approach verse this chunking approach. - // This approach is likely to be slower than a batched approach, but given the already huge memory overhead of Kokoro's graph it + // This approach is likely to be slower than a batched approach, but given the already huge memory overhead of Kokoro's graph it
std::vector clauses = split(phonemized_prompt, ".!?"); for (auto tokens : tokenize_chunks(clauses)) { kokoro_ubatch batch; batch.n_tokens = tokens.size(); batch.input_tokens = tokens.data(); - struct tts_response * partial = new tts_response; + tts_response partial{}; run(batch, partial); append_to_response(response, partial); } } - return 0; } -std::vector kokoro_runner::list_voices() { - std::vector voices; - voices.reserve(model->voices.size()); - for (auto voice : model->voices) { - voices.push_back(voice.first); - } - return voices; +std::vector kokoro_runner::list_voices() { + const auto voices{ views::keys(model->voices) | views::transform([](const auto & x) { return string_view{ x }; }) }; + return std::vector(cbegin(voices), cend(voices)); } - -std::string get_espeak_id_from_kokoro_voice(std::string voice) { +const char * get_espeak_id_from_kokoro_voice(std::string voice) { return !voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(voice[0]) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[voice[0]] : "gmw/en-US"; } diff --git a/src/kokoro_model.h b/src/models/kokoro/model.h similarity index 91% rename from src/kokoro_model.h rename to src/models/kokoro/model.h index b4f4f96..eed74aa 100644 --- a/src/kokoro_model.h +++ b/src/models/kokoro/model.h @@ -1,15 +1,23 @@ -#ifndef kokoro_model_h -#define kokoro_model_h +#pragma once -#include -#include "tts_model.h" -#include "tokenizer.h" +#include + +#include "../../tokenizer.h" +#include "../../tts_model.h" +#include "models/loaders.h" #include "phonemizer.h" +extern const struct kokoro_model_loader final : tts_model_loader { + explicit kokoro_model_loader(); + + unique_ptr from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, + bool cpu_only, const generation_configuration & config) const override; +} kokoro_loader; + // Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter. 
// Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the // appropriate phonemization protocol can inferred from the Kokoro voice. -static std::map KOKORO_LANG_TO_ESPEAK_ID = { +static std::map KOKORO_LANG_TO_ESPEAK_ID = { {'a', "gmw/en-US"}, {'b', "gmw/en"}, {'e', "roa/es"}, @@ -283,7 +291,7 @@ struct kokoro_model : tts_model { void post_load_assign(); - void assign_weight(std::string name, ggml_tensor * tensor); + void assign_weight(const char * name, ggml_tensor & tensor); void prep_layers(gguf_context * meta); void prep_constants(gguf_context * meta); void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) { @@ -344,9 +352,9 @@ static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style); static kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key); static kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index); -static kokoro_generator_upsample_block* kokoro_generator_upsample_block(gguf_context * meta, int index); +static kokoro_generator_upsample_block * kokoro_generator_upsample_block(gguf_context * meta, int index); -std::string get_espeak_id_from_kokoro_voice(std::string voice); +const char * get_espeak_id_from_kokoro_voice(std::string voice); struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true); struct kokoro_duration_response { @@ -421,8 +429,8 @@ static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * mod struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true); // This manages the graph compilation of computation for the Kokoro 
model. -struct kokoro_runner : tts_runner { - kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) { +struct kokoro_runner : tts_generation_runner { + kokoro_runner(unique_ptr model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): tts_generation_runner{kokoro_loader}, model{move(model)}, kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) { tts_runner::sampling_rate = 24000.0f; tts_runner::supports_voices = true; }; @@ -432,12 +440,11 @@ struct kokoro_runner : tts_runner { } delete drunner; model->free(); - delete model; delete kctx; delete phmzr; } struct single_pass_tokenizer * tokenizer; - kokoro_model * model; + unique_ptr model; kokoro_context * kctx; kokoro_duration_runner * drunner; phonemizer * phmzr; @@ -448,15 +455,13 @@ struct kokoro_runner : tts_runner { tts_runner::init_build(&kctx->buf_compute_meta); } - std::vector list_voices(); + std::vector list_voices() override; std::vector> tokenize_chunks(std::vector clauses); - void assign_weight(std::string name, ggml_tensor * tensor); + void assign_weight(const char * name, ggml_tensor & tensor); void prepare_post_load(); kokoro_ubatch build_worst_case_batch(); void set_inputs(kokoro_ubatch & batch, uint32_t total_size); struct ggml_cgraph * build_kokoro_graph(kokoro_ubatch & batch); - void run(kokoro_ubatch & batch, struct tts_response * outputs); - int generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code = ""); + void run(kokoro_ubatch & batch, tts_response & outputs); + void generate(const char * prompt, tts_response & response, const generation_configuration & config); }; - -#endif diff --git a/src/phonemizer.cpp b/src/models/kokoro/phonemizer.cpp similarity index 100% rename from 
src/phonemizer.cpp rename to src/models/kokoro/phonemizer.cpp diff --git a/include/phonemizer.h b/src/models/kokoro/phonemizer.h similarity index 100% rename from include/phonemizer.h rename to src/models/kokoro/phonemizer.h diff --git a/src/models/loaders.cpp b/src/models/loaders.cpp new file mode 100644 index 0000000..ace14de --- /dev/null +++ b/src/models/loaders.cpp @@ -0,0 +1,84 @@ +#include "loaders.h" + +#include +#include + +#include "common.h" +#include "ggml-iterator.h" +#include "ggml.h" +#include "llama-mmap.h" + +static unordered_map> LOADERS; + +tts_model_loader::tts_model_loader(const char * arch) : arch{ arch } { + LOADERS.emplace(arch, ref(*this)); +} + +void dia_register(); +void kokoro_register(); +void orpheus_register(); +void parler_register(); + +[[maybe_unused]] static bool loaders = [] { + dia_register(); + kokoro_register(); + orpheus_register(); + parler_register(); + return true; +}(); + +// currently only metal and cpu devices are supported, +// so cpu_only only describes whether or not to try to load and run on metal. 
+unique_ptr runner_from_file(const char * fname, int n_threads, + const generation_configuration & config, bool cpu_only) { + static const bool use_mmap{ !getenv("OLLAMA_NO_MMAP") }; // TODO(danielzgtg) temporary, will be --no-mmap later + unique_ptr in_mmap{}; + if (use_mmap) { + llama_file in_map_file{ fname, "r" }; + in_mmap = make_unique(&in_map_file); + } + ggml_context * weight_ctx{}; + gguf_context * meta_ctx = gguf_init_from_file(fname, { + .no_alloc{ use_mmap }, + .ctx{ &weight_ctx }, + }); + if (!meta_ctx) { + GGML_ABORT("gguf_init_from_file failed for file %s\n", fname); + } + if (use_mmap) { + const int n{ gguf_get_n_tensors(&*meta_ctx) }; + int i{}; + void * in_buffer{ static_cast(in_mmap->addr()) + gguf_get_data_offset(meta_ctx) }; + for (ggml_tensor & cur : ggml_tensor_iterator{ *weight_ctx }) { + GGML_ASSERT(i < n); + GGML_ASSERT(!strcmp(cur.name, gguf_get_tensor_name(&*meta_ctx, i))); + cur.data = static_cast(in_buffer) + gguf_get_tensor_offset(&*meta_ctx, i); + ++i; + } + } + const int arch_key = gguf_find_key(meta_ctx, "general.architecture"); + const char * const arch{ gguf_get_val_str(meta_ctx, arch_key) }; + const auto found = LOADERS.find(arch); + if (found == LOADERS.end()) { + GGML_ABORT("Unknown architecture %s\n", arch); + } + const auto & loader{ found->second.get() }; + unique_ptr runner{ loader.from_file(meta_ctx, weight_ctx, n_threads, cpu_only, config) }; + // TODO(mmwillet): change this weight assignment pattern to mirror llama.cpp + for (ggml_tensor & cur : ggml_tensor_iterator{ *weight_ctx }) { + if (!cur.data) { + continue; + } + if (!*cur.name) { + // handles the top level meta tensor + continue; + } + runner->assign_weight(cur.name, cur); + } + runner->prepare_post_load(); + gguf_free(meta_ctx); + ggml_free(weight_ctx); + GGML_ASSERT(&runner->loader.get() == &loader); + runner->buf = move(in_mmap); + return runner; +} diff --git a/src/models/loaders.h b/src/models/loaders.h new file mode 100644 index 0000000..7e48257 --- 
/dev/null +++ b/src/models/loaders.h @@ -0,0 +1,19 @@ +#pragma once + +#include "../../include/common.h" + +struct gguf_context; + +struct tts_model_loader { + /// Installs a model loader for the specified model architecture name + explicit tts_model_loader(const char * arch); + const char * const arch; + virtual unique_ptr from_file( + gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, bool cpu_only, + /* TODO move to generate() */ const generation_configuration & config) const = 0; + protected: + ~tts_model_loader() = default; +}; + +unique_ptr runner_from_file(const char * fname, int n_threads, + const generation_configuration & config, bool cpu_only = true); diff --git a/src/models/orpheus/CMakeLists.txt b/src/models/orpheus/CMakeLists.txt new file mode 100644 index 0000000..2e86004 --- /dev/null +++ b/src/models/orpheus/CMakeLists.txt @@ -0,0 +1,5 @@ +target_sources(tts PRIVATE + loader.cpp + model.cpp + model.h +) diff --git a/src/models/orpheus/loader.cpp b/src/models/orpheus/loader.cpp new file mode 100644 index 0000000..c6110a0 --- /dev/null +++ b/src/models/orpheus/loader.cpp @@ -0,0 +1,24 @@ +#include "../loaders.h" +#include "model.h" + +void orpheus_register() {} + +orpheus_model_loader::orpheus_model_loader() : tts_model_loader{ "orpheus" } {} + +unique_ptr orpheus_model_loader::from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, + int n_threads, bool cpu_only, + const generation_configuration & config) const { + orpheus_model * model = new orpheus_model; + snac_model * audio_model = new snac_model; + bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx); + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + sampler * samp = new sampler; + snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only); + snac_runner * audio_decoder = new snac_runner(audio_model, sctx); + orpheus_context * octx = build_new_orpheus_context(model, n_threads, 
cpu_only); + orpheus_kv_cache * cache = new orpheus_kv_cache; + return make_unique(model, audio_decoder, octx, bt, samp, cache); +} + +const orpheus_model_loader orpheus_loader{}; diff --git a/src/orpheus_model.cpp b/src/models/orpheus/model.cpp similarity index 93% rename from src/orpheus_model.cpp rename to src/models/orpheus/model.cpp index 4866af2..cda5793 100644 --- a/src/orpheus_model.cpp +++ b/src/models/orpheus/model.cpp @@ -1,4 +1,4 @@ -#include "orpheus_model.h" +#include "model.h" #include @@ -386,7 +386,7 @@ std::vector> orpheus_runner::prepare_output_tokens() { return output_tokens; } -void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) { +void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, tts_response & output) { while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) { decode(batch); generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens); @@ -401,10 +401,21 @@ void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_resp fprintf(stdout, "Warning: generation hit its max default length. 
The generated audio may not contain the entire prompt.\n"); } std::vector> processed_output_tokens = prepare_output_tokens(); - srunner->run(processed_output_tokens, output); + srunner->run(processed_output_tokens, &output); } -int orpheus_runner::generate(std::string sentence, struct tts_response * response) { +void orpheus_runner::generate(const char * sentence, tts_response & response, const generation_configuration & config) { + generation_sampler->temperature = config.temperature; + generation_sampler->repetition_penalty = config.repetition_penalty; + generation_sampler->do_sample = config.sample; + generation_sampler->top_k = config.top_k; + generation_sampler->top_p = config.top_p; + if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config.voice) == orpheus_voices.end() && + !config.voice.empty()) { + TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config.voice.c_str()); + } + octx->voice = config.voice; + orpheus_ubatch batch = batch_from_sentence(sentence); // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will // surpass the default size. 
@@ -417,19 +428,6 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons orpheus_kv_cache_init(); } generate_from_batch(batch, response); - return 0; -} - -void orpheus_runner::configure_generation(generation_configuration * config) { - generation_sampler->temperature = config->temperature; - generation_sampler->repetition_penalty = config->repetition_penalty; - generation_sampler->do_sample = config->sample; - generation_sampler->top_k = config->top_k; - generation_sampler->top_p = config->top_p; - if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) { - TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str()); - } - octx->voice = config->voice; } orpheus_ubatch orpheus_runner::build_worst_case_batch() { @@ -438,22 +436,13 @@ orpheus_ubatch orpheus_runner::build_worst_case_batch() { return batch; } -void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) { - if (tensor->data == NULL) { - return; - } - - if (name.size() == 0) { - // handles the top level meta tensor - return; - } - - if (name.size() > 5 && name.substr(0, 5) == "snac.") { - srunner->model->assign_weight(name.substr(5), tensor); - } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") { - model->assign_weight(name.substr(8), tensor); +void orpheus_runner::assign_weight(const char * name, ggml_tensor & tensor) { + if (const string_view name_sv{ name }; name_sv.starts_with("snac.")) { + srunner->model->assign_weight(string{ name_sv.substr(sizeof("snac.") - 1) }, &tensor); + } else if (name_sv.starts_with("orpheus.")) { + model->assign_weight(string{ name_sv.substr(sizeof("orpheus.") - 1) }, &tensor); } else { - fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str()); + fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name); } } @@ -465,11 +454,6 @@ void 
orpheus_runner::prepare_post_load() { octx->prep_schedule(gf); } -std::vector list_voices() { - std::vector voices; - voices.reserve(orpheus_voices.size()); - for (auto voice : orpheus_voices) { - voices.push_back(voice); - } - return voices; +std::vector orpheus_runner::list_voices() { + return vector(cbegin(orpheus_voices), cend(orpheus_voices)); } diff --git a/src/orpheus_model.h b/src/models/orpheus/model.h similarity index 81% rename from src/orpheus_model.h rename to src/models/orpheus/model.h index 9f02d76..0e2bb94 100644 --- a/src/orpheus_model.h +++ b/src/models/orpheus/model.h @@ -1,8 +1,17 @@ #pragma once -#include "sampler.h" -#include "tokenizer.h" -#include "snac_model.h" +#include "../../decoder/snac_model.h" +#include "../../sampler.h" +#include "../../tokenizer.h" +#include "models/loaders.h" + +extern const struct orpheus_model_loader final : tts_model_loader { + explicit orpheus_model_loader(); + + unique_ptr from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, + bool cpu_only, const generation_configuration & config) const override; +} orpheus_loader; + // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads. 
@@ -102,14 +111,14 @@ struct orpheus_ubatch { std::vector tokens; // [n_tokens] }; -struct orpheus_runner : tts_runner { +struct orpheus_runner : tts_generation_runner { orpheus_runner( orpheus_model * model, snac_runner * audio_decoder, orpheus_context * octx, bpe_tokenizer * bt, sampler * samp, - orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { + orpheus_kv_cache * cache): tts_generation_runner{orpheus_loader}, model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { tts_runner::sampling_rate = 24000.0f; generation_sampler->n_output_heads = 1; generation_sampler->vocab_size = model->vocab_size; @@ -126,20 +135,19 @@ struct orpheus_runner : tts_runner { tts_runner::init_build(&octx->buf_compute_meta); } - std::vector list_voices(); + std::vector list_voices() override; struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch); void orpheus_kv_cache_init(); void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat); - void configure_generation(generation_configuration * config); - void assign_weight(std::string name, ggml_tensor * tensor); + void assign_weight(const char * name, ggml_tensor & tensor) override; std::vector> prepare_output_tokens(); orpheus_ubatch build_worst_case_batch(); orpheus_ubatch batch_from_sentence(std::string sentence); void set_inputs(orpheus_ubatch & batch); void decode(orpheus_ubatch & batch); - void prepare_post_load(); - int generate(std::string sentence, struct tts_response * response); - void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output); + void prepare_post_load() override; + void generate(const char * sentence, tts_response & response, const generation_configuration & config) override; + void generate_from_batch(orpheus_ubatch & batch, 
tts_response & output); }; static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight); diff --git a/src/models/parler/CMakeLists.txt b/src/models/parler/CMakeLists.txt new file mode 100644 index 0000000..0a540f3 --- /dev/null +++ b/src/models/parler/CMakeLists.txt @@ -0,0 +1,6 @@ +target_sources(tts PRIVATE + loader.cpp + model.cpp + model.h +) +add_subdirectory(t5) diff --git a/src/models/parler/loader.cpp b/src/models/parler/loader.cpp new file mode 100644 index 0000000..881bafc --- /dev/null +++ b/src/models/parler/loader.cpp @@ -0,0 +1,26 @@ +#include "../loaders.h" +#include "model.h" + +void parler_register() {} + +parler_model_loader::parler_model_loader() : tts_model_loader{ "parler-tts" } {} + +unique_ptr parler_model_loader::from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, + int n_threads, bool cpu_only, + const generation_configuration & config) const { + parler_tts_model * model = new parler_tts_model; + dac_model * audio_model = new dac_model; + unigram_tokenizer * ut = unigram_tokenizer_from_gguf(meta_ctx); + ut->initialize_tokenizer(); + model->use_cross_attn = config.use_cross_attn; + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + sampler * samp = new sampler; + dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only); + dac_runner * audio_decoder = new dac_runner(audio_model, dctx); + parler_context * pctx = build_new_parler_context(model, n_threads, cpu_only); + parler_kv_cache * cache = new parler_kv_cache; + return make_unique(model, audio_decoder, pctx, ut, samp, cache); +} + +const parler_model_loader parler_loader{}; diff --git a/src/parler_model.cpp b/src/models/parler/model.cpp similarity index 94% rename from src/parler_model.cpp rename to src/models/parler/model.cpp index 7f4fec1..4d731d2 100644 --- a/src/parler_model.cpp +++ b/src/models/parler/model.cpp @@ -1,4 
+1,4 @@ -#include "parler_model.h" +#include "model.h" // For loading parler model from gguf file. static const std::map PARLER_TENSOR_GGUF_LOOKUP = { @@ -336,10 +336,9 @@ struct parler_context * build_new_parler_context(struct parler_tts_model * model return pctx; } -static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id) { +static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx) { const int64_t n_layer = (int64_t) model->layers.size(); - cache->seq_id = seq_id; - + ggml_backend_buffer_type_t buft = nullptr; // this will only really support cpu or metal for the time being; if (pctx->backend != nullptr) { @@ -498,32 +497,26 @@ static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts return batch; } -void parler_tts_runner::assign_weight(std::string name, ggml_tensor * tensor) { - std::string::size_type pos = name.find(".", 0); - std::string top_level(name.substr(0, pos)); - std::string value(name.substr(pos + 1)); - if (tensor->data == NULL) { - return; - } - if (top_level == "audio_encoder") { - dac_runner->model->assign_weight(value, tensor); - } else if (top_level == "decoder") { - model->assign_weight(value, tensor); +void parler_tts_runner::assign_weight(const char * name, ggml_tensor & tensor) { + if (const string_view name_sv{ name }; name_sv.starts_with("audio_encoder.")) { + dac_runner->model->assign_weight(string{ name_sv.substr(sizeof("audio_encoder.") - 1) }, &tensor); + } else if (name_sv.starts_with("decoder.")) { + model->assign_weight(string{ name_sv.substr(sizeof("decoder.") - 1) }, &tensor); } else { - return; + fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name); } } -void parler_tts_runner::update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only) { +void 
parler_tts_runner::update_conditional_prompt(const char * file_path, const char * prompt) { + const int n_threads{ pctx->n_threads }; + constexpr bool cpu_only{true}; // TODO t5_runner * text_encoder = text_encoder_from_file(file_path, n_threads, tokenizer, cpu_only); tts_response* response; text_encoder->generate(prompt, response); model->prep_cross_key_values(n_threads, response); delete text_encoder; - return; } - struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch) { init_build(); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); @@ -620,15 +613,6 @@ struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch return gf; } -void parler_tts_runner::configure_generation(generation_configuration * config) { - sampler->temperature = config->temperature; - sampler->repetition_penalty = config->repetition_penalty; - sampler->do_sample = config->sample; - sampler->top_k = config->top_k; - sampler->top_p = config->top_p; - model->use_cross_attn = config->use_cross_attn; -} - void parler_tts_runner::set_inputs(parler_ubatch & batch) { if (batch.audio_generation) { ggml_backend_tensor_set(pctx->audio_inp_tokens, batch.audio_tokens, 0, batch.n_audio_tokens*ggml_element_size(pctx->audio_inp_tokens)); @@ -718,17 +702,16 @@ parler_ubatch parler_tts_runner::build_worst_case_batch() { } void parler_tts_runner::prepare_post_load() { + if (model->use_cross_attn) { + model->prep_cross_key_values(pctx->n_threads); + } dac_runner->prepare_post_load(); - parler_kv_cache_init(kv_self, model, pctx, std::mt19937(std::random_device{}())()); + parler_kv_cache_init(kv_self, model, pctx); auto batch = build_worst_case_batch(); auto gf = build_parler_graph(batch); pctx->prep_schedule(gf); } -bool parler_tts_runner::adjust_for_sequence_continuation(struct parler_ubatch & batch) { - return false; // not implemneted -} - bool parler_tts_runner::check_stopping() { int32_t token_position = (int32_t) pctx->output_tokens.size() - 
(int32_t) model->n_output_heads; if (token_position < 0) { @@ -776,7 +759,7 @@ void parler_tts_runner::adjust_output_tokens(std::vector & output_toke } } -int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_response * output) { +int parler_tts_runner::generate_from_batch(parler_ubatch & batch, tts_response & output) { std::vector next_decoder_token_ids; next_decoder_token_ids.reserve(model->n_output_heads); @@ -804,7 +787,7 @@ int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_res std::vector filtered_output_tokens; adjust_output_tokens(pctx->output_tokens, filtered_output_tokens); - dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output); + dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, &output); return 0; } @@ -815,7 +798,7 @@ int parler_tts_runner::generate_audio_tokens(std::string sentence) { int32_t seq_id = std::mt19937(std::random_device{}())(); if (!kv_self) { kv_self = new parler_kv_cache; - if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) { + if (!parler_kv_cache_init(kv_self, model, pctx)) { return 1; } } @@ -852,23 +835,24 @@ void parler_tts_runner::just_audio_token_decode(uint32_t * tokens, int32_t sq_le dac_runner->run(tokens, sq_len, outputs); } -int parler_tts_runner::generate(std::string sentence, struct tts_response * output, int32_t seq_id) { +void parler_tts_runner::generate(const char * sentence, tts_response & output, + const generation_configuration & config) { + sampler->temperature = config.temperature; + sampler->repetition_penalty = config.repetition_penalty; + sampler->do_sample = config.sample; + sampler->top_k = config.top_k; + sampler->top_p = config.top_p; + model->use_cross_attn = config.use_cross_attn; + parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer); pctx->reset(model->n_output_heads); sampler->reset(); - if (pctx->seq_id 
!= seq_id || seq_id == -1) { - seq_id = std::mt19937(std::random_device{}())(); - pctx->current_position = 0; - if (!kv_self) { - kv_self = new parler_kv_cache; - if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) { - return 1; - } - } - } else { - if (!adjust_for_sequence_continuation(batch)) { - return 2; + pctx->current_position = 0; + if (!kv_self) { + kv_self = new parler_kv_cache; + if (!parler_kv_cache_init(kv_self, model, pctx)) { + return; } } - return generate_from_batch(batch, output); + generate_from_batch(batch, output); } diff --git a/src/parler_model.h b/src/models/parler/model.h similarity index 88% rename from src/parler_model.h rename to src/models/parler/model.h index 463910f..529db71 100644 --- a/src/parler_model.h +++ b/src/models/parler/model.h @@ -1,9 +1,16 @@ -#ifndef parler_model_h -#define parler_model_h +#pragma once -#include "dac_model.h" -#include "t5_encoder_model.h" -#include "sampler.h" +#include "../../decoder/dac_model.h" +#include "../../sampler.h" +#include "models/loaders.h" +#include "t5/model.h" + +extern const struct parler_model_loader final : tts_model_loader { + explicit parler_model_loader(); + + unique_ptr from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, + bool cpu_only, const generation_configuration & config) const override; +} parler_loader; enum parler_tensor { PARLER_EMBD, @@ -112,8 +119,7 @@ struct parler_context : runner_context { int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch uint32_t current_position = 0; // current position in the active sequence uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating) - int32_t seq_id; // a unique identifier associated with the active sequence. 
- + std::vector output_tokens; struct ggml_tensor * inp_tokens; @@ -129,8 +135,6 @@ struct parler_context : runner_context { }; struct parler_kv_cache { - int32_t seq_id; - ggml_type type_k = GGML_TYPE_F32; ggml_type type_v = GGML_TYPE_F32; @@ -168,7 +172,7 @@ struct parler_ubatch { }; struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu = true); -static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id); +static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx); struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, const parler_ubatch & batch); struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias); @@ -180,8 +184,8 @@ static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts // This struct is intended to support end-to-end TTS generation. As such, it manages the parler tts model compilation, compute and generation process, // the tokenization and sampling process, and uses the dac_runner struct to encode audio outputs. 
-struct parler_tts_runner : tts_runner { - parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {}; +struct parler_tts_runner : tts_generation_runner { + parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): tts_generation_runner{parler_loader}, model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {}; ~parler_tts_runner() { if (ctx) { ggml_free(ctx); @@ -204,22 +208,18 @@ struct parler_tts_runner : tts_runner { tts_runner::init_build(&pctx->buf_compute_meta); } - void configure_generation(generation_configuration * config); - void assign_weight(std::string name, ggml_tensor * tensor); + void assign_weight(const char * name, ggml_tensor & tensor) override; parler_ubatch build_worst_case_batch(); struct ggml_cgraph * build_parler_graph(parler_ubatch & batch); void set_inputs(parler_ubatch & batch); int decode(parler_ubatch & batch); - void prepare_post_load(); - bool adjust_for_sequence_continuation(struct parler_ubatch & batch); - int generate(std::string sentence, struct tts_response * response, int32_t seq_id = -1); + void prepare_post_load() override; + void generate(const char * sentence, tts_response & output, const generation_configuration & config) override; bool check_stopping(); void adjust_output_tokens(std::vector & output_tokens, std::vector & filtered); - int generate_from_batch(parler_ubatch & batch, struct tts_response * output); + int generate_from_batch(parler_ubatch & batch, tts_response & output); void parler_graph_compute(ggml_cgraph * gf); void just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * output); int generate_audio_tokens(std::string sentence); - void 
update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only = true); + void update_conditional_prompt(const char * file_path, const char * prompt) override; }; - -#endif diff --git a/src/models/parler/t5/CMakeLists.txt b/src/models/parler/t5/CMakeLists.txt new file mode 100644 index 0000000..7ba20a4 --- /dev/null +++ b/src/models/parler/t5/CMakeLists.txt @@ -0,0 +1,4 @@ +target_sources(tts PRIVATE + model.cpp + model.h +) diff --git a/src/t5_encoder_model.cpp b/src/models/parler/t5/model.cpp similarity index 99% rename from src/t5_encoder_model.cpp rename to src/models/parler/t5/model.cpp index 2dbc761..3751278 100644 --- a/src/t5_encoder_model.cpp +++ b/src/models/parler/t5/model.cpp @@ -1,4 +1,4 @@ -#include "t5_encoder_model.h" +#include "model.h" static const std::map T5_TENSOR_GGUF_LOOKUP = { {"t5encoder.token_embd", T5_EMBD}, @@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) { int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); if (bos_token_id_key != -1) { bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); - } + } int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); if (eos_token_id_key != -1) { @@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { struct ggml_tensor * cur; struct ggml_tensor * inpL; - + //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); //ggml_set_input(t5ctx->positions); @@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch); struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias); - + for (int l = 0; l < model->n_layers; l++) { struct ggml_tensor * residual = inpL; @@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) { ggml_build_forward_expand(gf, cur); free_build(); - + 
return gf; } @@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) { for (int ii = 0; ii < batch.n_tokens; ii++) { int ab_rpos = abs(i - ii); int rpos = i - ii; - attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; + attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact)))); } } @@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt batch.input_tokens = input_tokens; batch.n_tokens = sequence_length; ggml_backend_sched_reset(t5ctx->sched); - + const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0; const size_t new_size = model->max_context_length * model->output_size * sizeof(float); - + if (!t5ctx->buf_output || prev_size < new_size) { if (t5ctx->buf_output) { ggml_backend_buffer_free(t5ctx->buf_output); @@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size); } - + outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output); ggml_backend_buffer_clear(t5ctx->buf_output, 0); struct ggml_cgraph * gf = NULL; diff --git a/src/t5_encoder_model.h b/src/models/parler/t5/model.h similarity index 97% rename from src/t5_encoder_model.h rename to src/models/parler/t5/model.h index 9a80187..423ccce 100644 --- a/src/t5_encoder_model.h +++ b/src/models/parler/t5/model.h @@ -1,9 +1,7 @@ -#ifndef t5_encoder_model_h -#define t5_encoder_model_h - -#include "tts_model.h" -#include "tokenizer.h" +#pragma once +#include "../../../tokenizer.h" +#include "../../../tts_model.h" enum t5_tensor { T5_EMBD, @@ -126,5 +124,3 @@ struct t5_runner : tts_runner { }; struct t5_runner * 
text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only = true); - -#endif diff --git a/src/tts.cpp b/src/tts.cpp deleted file mode 100644 index f5faf28..0000000 --- a/src/tts.cpp +++ /dev/null @@ -1,445 +0,0 @@ -#include "tts.h" -#include - -// A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors. -static constexpr std::array DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = { - "duration_proj", - "encode", - "shared_lstm", - "duration_lstm", - "layers" -}; - -struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { - orpheus_model * model = new orpheus_model; - snac_model * audio_model = new snac_model; - bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx); - model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - sampler * samp = new sampler; - snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only); - snac_runner * audio_decoder = new snac_runner(audio_model, sctx); - orpheus_context * octx = build_new_orpheus_context(model, n_threads, cpu_only); - orpheus_kv_cache * cache = new orpheus_kv_cache; - orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache); - - for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { - runner->assign_weight(cur->name, cur); - } - - runner->prepare_post_load(); - - gguf_free(meta_ctx); - ggml_free(weight_ctx); - runner->arch = arch; - - return (tts_runner*)runner; -} - -struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { - parler_tts_model * model = new parler_tts_model; - dac_model * audio_model = new 
dac_model; - unigram_tokenizer * ut = unigram_tokenizer_from_gguf(meta_ctx); - ut->initialize_tokenizer(); - model->use_cross_attn = config->use_cross_attn; - model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - struct sampler * samp = new sampler; - struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only); - struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx); - struct parler_context * pctx = build_new_parler_context(model, n_threads, cpu_only); - struct parler_kv_cache * cache = new parler_kv_cache; - struct parler_tts_runner * runner = new parler_tts_runner(model, audio_decoder, pctx, ut, samp, cache); - - // TODO: change this weight assignment pattern to mirror llama.cpp - for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { - runner->assign_weight(cur->name, cur); - } - - if (config->use_cross_attn) { - runner->model->prep_cross_key_values(n_threads); - } - - runner->prepare_post_load(); - - gguf_free(meta_ctx); - ggml_free(weight_ctx); - runner->arch = arch; - - return (tts_runner*)runner; -} - -struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { - kokoro_model * model = new kokoro_model; - single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens"); - model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only); - struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt); - struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only); - // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English. 
- std::string espeak_voice_id = config->espeak_voice_id; - if (espeak_voice_id.empty()) { - espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US"; - } - struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id); - struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr); - - // TODO: change this weight assignment pattern to mirror llama.cpp - for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { - runner->assign_weight(cur->name, cur); - } - - runner->prepare_post_load(); - - gguf_free(meta_ctx); - ggml_free(weight_ctx); - runner->arch = arch; - - return (tts_runner*)runner; -} - -struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { - dia_model * model = new dia_model; - dac_model * audio_model = new dac_model; - model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); - struct sampler * samp = new sampler; - struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only); - struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx); - struct dia_context * diactx = build_new_dia_context(model, n_threads, cpu_only); - struct dia_kv_cache * cache = new dia_kv_cache; - struct dia_runner * runner = new dia_runner(model, audio_decoder, diactx, samp, cache); - - for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { - runner->assign_weight(cur->name, cur); - } - - runner->prepare_post_load(); - - gguf_free(meta_ctx); - ggml_free(weight_ctx); - runner->arch = arch; - - return (tts_runner*)runner; -} - -// currently only metal and cpu devices are supported, so cpu_only 
only describes whether or not to try to load and run on metal. -struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only) { - ggml_context * weight_ctx = NULL; - - struct gguf_init_params params = { - /*.no_alloc =*/ false, - /*.ctx =*/ &weight_ctx, - }; - gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params); - if (!meta_ctx) { - TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str()); - } - int arch_key = gguf_find_key(meta_ctx, "general.architecture"); - if (arch_key == -1) { - TTS_ABORT("%s failed for file %s. No architecture is set.\n", __func__, fname.c_str()); - } - std::string arch = std::string(gguf_get_val_str(meta_ctx, arch_key)); - if (SUPPORTED_ARCHITECTURES.find(arch) == SUPPORTED_ARCHITECTURES.end()) { - TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str()); - } - tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch); - switch(arch_type) { - case PARLER_TTS_ARCH: - return parler_tts_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); - case KOKORO_ARCH: - return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); - case DIA_ARCH: - return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); - case ORPHEUS_ARCH: - return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); - default: - TTS_ABORT("%s failed for file %s. 
The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str()); - } -} - -int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) { - switch(runner->arch) { - case PARLER_TTS_ARCH: - ((parler_tts_runner*)runner)->configure_generation(config); - return ((parler_tts_runner*)runner)->generate(sentence, response); - case KOKORO_ARCH: - return ((kokoro_runner*)runner)->generate(sentence, response, config->voice, config->espeak_voice_id); - case DIA_ARCH: - ((dia_runner*)runner)->configure_generation(config); - return ((dia_runner*)runner)->generate(sentence, response); - case ORPHEUS_ARCH: - ((orpheus_runner*)runner)->configure_generation(config); - return ((orpheus_runner*)runner)->generate(sentence, response); - default: - TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, runner->arch); - } -} - -std::vector list_voices(tts_runner * runner) { - switch(runner->arch) { - case KOKORO_ARCH: - return ((kokoro_runner*)runner)->list_voices(); - default: - TTS_ABORT("%s failed. 
The architecture '%d' does not support #list_voices supported.", __func__, runner->arch); - } -} - -void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) { - int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads; - ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only); -} - -bool kokoro_is_f16_compatible(std::string name) { - return name.find("voice_tensors") == std::string::npos && - name.find("bias") == std::string::npos && - name.find("gamma") == std::string::npos && - name.find("beta") == std::string::npos && - name.find("alpha") == std::string::npos && - !has_suffix(name, "embd") && - !has_suffix(name, "norm"); -} - -bool kokoro_is_quantizable(std::string name, struct quantization_params * params) { - if (kokoro_is_f16_compatible(name)) { - if (has_prefix(name, "kokoro.albert") || has_prefix(name, "kokoro.text_encoder.lstm")) { - return true; - } else if (has_prefix(name, "kokoro.duration_predictor.")) { - std::vector parts = split(name, "."); - for (std::string part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) { - if (part == parts[2]) { - return true; - } - } - } - } - return false; -} - -bool dia_is_quantizable(std::string name, struct quantization_params * params) { - // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized. - bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm"); - if (!params->quantize_output_heads) { - quantizable = quantizable && !has_prefix(name, "dia.decoder.heads"); - } - return quantizable; -} - -bool parler_is_quanitizable(std::string name, struct quantization_params * params) { - // the DAC audio encoder / decoder is not compatible with quantization, normalization weight shouldn't be quantized, and the text encoding shouldn't be normalized. 
- bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm.weight") && !has_suffix(name, "text_encoding") && !has_suffix(name, "positional_embed") && !has_suffix(name, "norm.bias"); - if (!params->quantize_output_heads) { - quantizable = quantizable && !has_suffix(name, "weight.head"); - } - if (!params->quantize_text_embeddings) { - quantizable = quantizable && !has_suffix(name, "embed_prompts"); - } - if (!params->quantize_cross_attn_kv) { - quantizable = quantizable && !has_suffix(name, "encoder_attn.k_proj.weight") && !has_suffix(name, "encoder_attn.v_proj.weight"); - } - return quantizable; -} - -bool is_quantizable(tts_arch arch, std::string name, struct quantization_params * params) { - switch(arch) { - case PARLER_TTS_ARCH: - return parler_is_quanitizable(name, params); - case DIA_ARCH: - return dia_is_quantizable(name, params); - case KOKORO_ARCH: - return kokoro_is_quantizable(name, params); - default: - TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch); - } -} - -size_t quantize_tensor(void * new_data, struct ggml_tensor * tensor, const float * imatrix, enum ggml_type qtype, uint32_t n_threads) { - // much of this is form copied from llama.cpp - int chunk_size_multiplier = 1; - if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) { - if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) qtype = GGML_TYPE_Q4_0; - else if (tensor->ne[1] % 4 != 0) qtype = GGML_TYPE_Q4_0; - if (qtype == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; - else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; - } - size_t out_size = 0; - const int32_t d3_step = tensor->ne[0] * tensor->ne[1]; - const int32_t n_per_row = tensor->ne[0]; - const int32_t nrows = tensor->ne[1]; - static const int32_t min_chunk_size = 32 * 512; - const int32_t chunk_size = (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier; - uint32_t thread_count = std::max(1, std::min((int)n_threads, (int)(d3_step + chunk_size - 1) / chunk_size)); - std::mutex mutex; - - for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) { - const float * f32_data_d3 = ((float *) tensor->data) + d3_index * d3_step; - void * new_data_d3 = (char *)new_data + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows; - const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr; - if (thread_count <= 1) { - // not threaded - out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix); - } else { - std::vector threads; - int64_t counter = 0; - size_t new_size = 0; - bool valid = true; - for (uint32_t t = 0; t < thread_count; t++) { - auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, n_per_row, imatrix]() { - const int64_t nrows_per_chunk = chunk_size / n_per_row; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - int64_t first_row = counter; - counter += nrows_per_chunk; - if (first_row >= nrows) { - if (local_size > 0) { - new_size += local_size; - } - break; - } - lock.unlock(); - const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); - size_t this_size = ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix); - local_size += this_size; - - // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded. 
- const size_t row_size = ggml_row_size(qtype, n_per_row); - void * this_data = (char *) new_data_d3 + first_row * row_size; - if (!ggml_validate_row_data(qtype, this_data, this_size)) { - std::unique_lock lock(mutex); - valid = false; - break; - } - } - }; - threads.push_back(std::thread(func)); - } - for (auto & t : threads) t.join(); - - if (!valid) { - TTS_ABORT("Validation of quantized data failed. Please try again and/or switch to single thread quantization.\n"); - } - out_size += new_size; - } - } - return out_size; -} - -static void zeros(std::ofstream & file, size_t n) { - char zero = 0; - for (size_t i = 0; i < n; ++i) { - file.write(&zero, 1); - } -} - -template -struct no_init { - T value; - no_init() { /* do nothing */ } -}; - -void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params) { - ggml_context * weight_ctx = NULL; - struct gguf_init_params gguf_params = { - /*.no_alloc =*/ false, - /*.ctx =*/ &weight_ctx, - }; - gguf_context * meta_ctx = gguf_init_from_file(ifile.c_str(), gguf_params); - std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture. - - int arch_key = gguf_find_key(meta_ctx, "general.architecture"); - if (arch_key != -1) { - arch = std::string(gguf_get_val_str(meta_ctx, arch_key)); - } - tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch); - - if (params->quantize_type != GGML_TYPE_Q5_0 && params->quantize_type != GGML_TYPE_Q8_0 && params->quantize_type != GGML_TYPE_F16 && params->quantize_type != GGML_TYPE_Q4_0) { - fprintf(stdout, "Warning, %s is untested for quantization type '%d'. 
Use at your own risk.\n", arch.c_str(), params->quantize_type); - } - - const size_t align = GGUF_DEFAULT_ALIGNMENT; - gguf_context_ptr ctx_out { gguf_init_empty() }; - - // copy the KV pairs from the input file - gguf_set_kv(ctx_out.get(), meta_ctx); - gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); - gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params->quantize_type); - for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; tensor = ggml_get_next_tensor(weight_ctx, tensor)) { - std::string name = ggml_get_name(tensor); - if (name.size() != 0) { - gguf_add_tensor(ctx_out.get(), tensor); - } - } - - std::vector> work; - - std::ofstream fout; - auto close_ofstream = [&]() { - // Write metadata and close file handler - if (fout.is_open()) { - fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_out.get())); - gguf_get_meta_data(ctx_out.get(), data.data()); - fout.write((const char *) data.data(), data.size()); - fout.close(); - } - }; - auto new_ofstream = [&]() { - std::string fname = ofile; - fout = std::ofstream(fname, std::ios::binary); - fout.exceptions(std::ofstream::failbit); // fail fast on write errors - const size_t meta_size = gguf_get_meta_size(ctx_out.get()); - // placeholder for the meta data - ::zeros(fout, meta_size); - }; - new_ofstream(); - for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { - enum ggml_type new_type; - void * new_data; - size_t new_size; - std::string name = ggml_get_name(cur); - - if (name.size() == 0) { - continue; - } - - if (is_quantizable(arch_type, name, params)) { - if ((cur->type) != GGML_TYPE_F32) { - TTS_ABORT("ERROR: All quantized tensors must be transformed from 32bit floats. 
Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type); - } - new_type = params->quantize_type; - if ((new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS)) { - TTS_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type); - } - const int64_t nelement_size = ggml_nelements(cur) * 4; - if (work.size() < (size_t)nelement_size) { - work.resize(nelement_size); // upper bound on size - } - new_data = work.data(); - new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads); - } else if ((params->convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || (params->convert_dac_to_f16 && has_prefix(name, "audio_encoder") && !has_suffix(name, "alpha"))) { - if ((cur->type) != GGML_TYPE_F32) { - TTS_ABORT("ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type); - } - new_type = GGML_TYPE_F16; - const int64_t nelement_size = ggml_nelements(cur) * 4; - if (work.size() < (size_t)nelement_size) { - work.resize(nelement_size); // upper bound on size - } - new_data = work.data(); - new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads); - } else { - new_type = cur->type; - new_data = cur->data; - new_size = ggml_nbytes(cur); - } - - gguf_set_tensor_type(ctx_out.get(), name.c_str(), new_type); - gguf_set_tensor_data(ctx_out.get(), name.c_str(), new_data, new_size); - fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name.c_str(), new_size); - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); - } - close_ofstream(); -} diff --git a/src/tts_model.cpp b/src/tts_model.cpp index 8fb8412..cb1924e 100644 --- a/src/tts_model.cpp +++ b/src/tts_model.cpp @@ -1,18 +1,21 @@ #include "tts_model.h" +#include "llama-mmap.h" + #include "ggml-backend.h" #include "ggml-cpu.h" +#include "models/loaders.h" -void 
append_to_response(struct tts_response * response, struct tts_response * to_append) { - float * new_data = (float *) malloc((response->n_outputs + to_append->n_outputs) * sizeof(float)); - if (response->n_outputs > 0) { - std::memcpy(new_data, response->data, response->n_outputs*sizeof(float)); +void append_to_response(tts_response & response, tts_response & to_append) { + float * new_data = (float *) malloc((response.n_outputs + to_append.n_outputs) * sizeof(float)); + if (response.n_outputs > 0) { + std::memcpy(new_data, response.data, response.n_outputs*sizeof(float)); } - if (to_append->n_outputs > 0) { - float * next_loc = new_data + response->n_outputs; - std::memcpy(next_loc, to_append->data, to_append->n_outputs*sizeof(float)); + if (to_append.n_outputs > 0) { + float * next_loc = new_data + response.n_outputs; + std::memcpy(next_loc, to_append.data, to_append.n_outputs*sizeof(float)); } - response->data = new_data; - response->n_outputs += to_append->n_outputs; + response.data = new_data; + response.n_outputs += to_append.n_outputs; } /* @@ -97,6 +100,18 @@ void tts_runner::free_build() { } } +tts_generation_runner::tts_generation_runner(const tts_model_loader & loader) : loader{ ref(loader) } {} + +tts_generation_runner::~tts_generation_runner() {} + +std::vector tts_generation_runner::list_voices() { + GGML_ABORT("The architecture '%s' does not support #list_voices.", loader.get().arch); +} + +void tts_generation_runner::update_conditional_prompt(const char * file_path, const char * prompt) { + GGML_ABORT("The architecture '%s' does not support update_conditional_prompt.", loader.get().arch); +} + void tts_model::prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size) { // currently DAC is only supported on cpu because the ops are not implemented on other devices; if (cpu_only) { diff --git a/src/tts_model.h b/src/tts_model.h index 93d0a21..0bbd21e 100644 --- a/src/tts_model.h +++ b/src/tts_model.h @@ -3,10 +3,13 @@ 
#include #include +#include #include "util.h" #include "common.h" -void append_to_response(struct tts_response * response, struct tts_response * to_append); +using namespace std; + +void append_to_response(tts_response & response, tts_response & to_append); using tensor_meta_callback = std::function*;