diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..907598d
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,162 @@
+# Adapted from llama.cpp
+# fab5d30ff6729ff6ff615c41e8c0215d6bc30393 by Diego Devesa
+---
+Language:        Cpp
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: Left
+AlignConsecutiveAssignments: AcrossComments
+AlignConsecutiveBitFields: AcrossComments
+AlignConsecutiveDeclarations: AcrossComments
+AlignConsecutiveMacros: AcrossComments
+# AlignConsecutiveShortCaseStatements: AcrossComments
+AlignEscapedNewlines: Left # LeftWithLastLine
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 1
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: false
+# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Inline
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: true
+BinPackArguments: true
+BinPackParameters: true # OnePerLine
+BitFieldColonSpacing: Both
+BreakBeforeBraces: Custom # Attach
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile: false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+# BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Never
+BreakBeforeBinaryOperators: None
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: false
+# BreakBinaryOperations: Never
+BreakConstructorInitializers: AfterColon
+# BreakFunctionDefinitionParameters: false
+BreakInheritanceList: AfterComma
+BreakStringLiterals: true
+# BreakTemplateDeclarations: Yes
+ColumnLimit:     120
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineBeforeAccessModifier: Leave
+EmptyLineAfterAccessModifier: Never
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentExternBlock: NoIndent
+IndentGotoLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+InsertBraces:    true # NOTE: may lead to incorrect formatting
+InsertNewlineAtEOF: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+LambdaBodyIndentation: Signature
+LineEnding: LF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PPIndentWidth: -1
+PackConstructorInitializers: CurrentLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Middle
+QualifierAlignment: Left
+#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+ReferenceAlignment: Middle
+ReflowComments:  false # IndentOnly
+SeparateDefinitionBlocks: Always
+SortIncludes:    CaseInsensitive
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: true
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum: 1
+  Maximum: -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+Standard:        c++20
+TabWidth:        4
+UseTab:          Never
+WhitespaceSensitiveMacros: ['STRINGIZE']
+...
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8fe3267..b58895c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,13 +42,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
-if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-    set(TTS_STANDALONE ON)
-
-    include(git-vars)
-else()
-    set(TTS_STANDALONE OFF)
-endif()
+include(git-vars)
 
 option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 
@@ -82,10 +76,8 @@ set(GGML_FATAL_WARNINGS     ${TTS_FATAL_WARNINGS})
 
 
 # build lib
-if (NOT TARGET ggml)
-    add_subdirectory(ggml)
-    # ... otherwise assume ggml is added by a parent CMakeLists.txt
-endif()
+add_subdirectory(ggml)
+add_subdirectory(ggml-patches)
 add_subdirectory(src)
 
 # install tts
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
index 103f216..1bb454a 100644
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@@ -1,10 +1,12 @@
-#include "tts.h"
+#include <thread>
+
+#include "../../src/models/loaders.h"
 #include "args.h"
 #include "common.h"
+#include "ggml.h"
 #include "playback.h"
 #include "vad.h"
 #include "write_file.h"
-#include <thread>
 
 class tts_timing_printer {
     const int64_t start_us{[] {
@@ -64,24 +66,24 @@ int main(int argc, const char ** argv) {
         exit(1);
     }
 
-    generation_configuration * config = new generation_configuration(
-        args.get_string_param("--voice"), 
-        *args.get_int_param("--topk"), 
-        *args.get_float_param("--temperature"), 
-        *args.get_float_param("--repetition-penalty"), 
+    const generation_configuration config{
+        args.get_string_param("--voice"),
+        *args.get_int_param("--topk"),
+        *args.get_float_param("--temperature"),
+        *args.get_float_param("--repetition-penalty"),
         !args.get_bool_param("--no-cross-attn"),
         args.get_string_param("--espeak-voice-id"),
         *args.get_int_param("--max-tokens"),
-        *args.get_float_param("--top-p"));
+        *args.get_float_param("--top-p")};
 
-    struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
+    unique_ptr<tts_generation_runner> runner{runner_from_file(args.get_string_param("--model-path").c_str(), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"))};
 
-    if (conditional_prompt.size() > 0) {
-        update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true);
+    if (!conditional_prompt.empty()) {
+        runner->update_conditional_prompt(text_encoder_path.c_str(), conditional_prompt.c_str());
     }
     tts_response data;
 
-    generate(runner, args.get_string_param("--prompt"), &data, config);
+    runner->generate(args.get_string_param("--prompt").c_str(), data, config);
     if (data.n_outputs == 0) {
         fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str());
         exit(1);
@@ -92,5 +94,6 @@ int main(int argc, const char ** argv) {
     if (!play_tts_response(args, data, runner->sampling_rate)) {
         write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate);
     }
+    static_cast<void>(!runner.release()); // TODO the destructor doesn't work yet
     return 0;
 }
diff --git a/examples/perf_battery/perf_battery.cpp b/examples/perf_battery/perf_battery.cpp
index 36d0cbc..c8a1c62 100644
--- a/examples/perf_battery/perf_battery.cpp
+++ b/examples/perf_battery/perf_battery.cpp
@@ -1,16 +1,12 @@
-#include "tts.h"
-#include "args.h"
-#include "common.h"
 #include <stdio.h>
+
 #include <chrono>
 #include <functional>
 #include <thread>
 
-
-std::vector<std::string> ARCH_LOOKUP = {
-	"parler-tts",
-	"kokoro",
-};
+#include "../../src/models/loaders.h"
+#include "args.h"
+#include "common.h"
 
 using perf_cb = std::function<void()>;
 
@@ -67,15 +63,14 @@ double mean(std::vector<double> series) {
 	return (double) sum / series.size();
 }
 
-std::string benchmark_printout(tts_arch arch, std::vector<double> generation_samples, std::vector<double> output_times) {
-	std::string arch_name = ARCH_LOOKUP[(int)arch];
+std::string benchmark_printout(const char * arch, std::vector<double> generation_samples, std::vector<double> output_times) {
 	double gen_mean = mean(generation_samples);
 	std::vector<double> gen_output;
 	for (int i = 0; i < (int) output_times.size(); i++) {
 		gen_output.push_back(generation_samples[i]/output_times[i]);
 	}
 	double gen_out_mean = mean(gen_output);
-	std::string printout = (std::string) "Mean Stats for arch " + arch_name + ":\n\n" + (std::string) "  Generation Time (ms):             " +  std::to_string(gen_mean) + (std::string) "\n";
+	std::string printout = (std::string) "Mean Stats for arch " + arch + ":\n\n" + (std::string) "  Generation Time (ms):             " +  std::to_string(gen_mean) + (std::string) "\n";
 	printout += (std::string) "  Generation Real Time Factor (ms): " + std::to_string(gen_out_mean) + (std::string) "\n";
 	return printout;
 }
@@ -102,22 +97,23 @@ int main(int argc, const char ** argv) {
     }
     args.validate();
 
-    generation_configuration * config = new generation_configuration(args.get_string_param("--voice"), *args.get_int_param("--topk"), *args.get_float_param("--temperature"), *args.get_float_param("--repetition-penalty"), !args.get_bool_param("--no-cross-attn"));
+    const generation_configuration config{args.get_string_param("--voice"), *args.get_int_param("--topk"), *args.get_float_param("--temperature"), *args.get_float_param("--repetition-penalty"), !args.get_bool_param("--no-cross-attn")};
 
-    struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
+    unique_ptr<tts_generation_runner> runner{runner_from_file(args.get_string_param("--model-path").c_str(), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"))};
     std::vector<double> generation_samples;
     std::vector<double> output_times;
     
     for (std::string sentence : TEST_SENTENCES) {
     	tts_response response;
     	perf_cb cb = [&]{
-    		generate(runner, sentence, &response, config);
+    		runner->generate(sentence.c_str(), response, config);
     	};
     	double generation_ms = benchmark_ms(cb);
     	output_times.push_back((double)(response.n_outputs / 44.1));
     	generation_samples.push_back(generation_ms);
     }
 
-    fprintf(stdout, "%s", benchmark_printout(runner->arch, generation_samples, output_times).c_str());
+    fprintf(stdout, "%s", benchmark_printout(runner->loader.get().arch, generation_samples, output_times).c_str());
+    static_cast<void>(!runner.release()); // TODO the destructor doesn't work yet
 	return 0;
 }
diff --git a/examples/phonemize/phonemize.cpp b/examples/phonemize/phonemize.cpp
index 83d551d..636e6f4 100644
--- a/examples/phonemize/phonemize.cpp
+++ b/examples/phonemize/phonemize.cpp
@@ -1,7 +1,8 @@
-#include "phonemizer.h"
-#include "args.h"
 #include <stdio.h>
 
+#include "../../src/models/kokoro/phonemizer.h"
+#include "args.h"
+
 int main(int argc, const char ** argv) {
     arg_list args;
     args.add_argument(string_arg("--phonemizer-path", "(OPTIONAL) The local path of the gguf phonemiser file for TTS.cpp phonemizer. This is required if not using espeak.", "-mp"));
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index fda21c8..254a619 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,2 +1,6 @@
-add_executable(quantize quantize.cpp)
+add_executable(quantize
+        quantize.cpp
+        quantize_impl.cpp
+        quantize_impl.h
+)
 target_link_libraries(quantize PRIVATE ggml tts)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 2cad888..ffb41f8 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,11 +1,12 @@
-#include <stdio.h>
-#include <thread>
+#include <cstdio>
 #include <map>
+#include <thread>
 #include <vector>
 
+#include "../../src/models/loaders.h"
 #include "args.h"
 #include "ggml.h"
-#include "tts.h"
+#include "quantize_impl.h"
 
 const std::map<std::string, ggml_type> valid_quantization_types = {
     {"FP16", GGML_TYPE_F16},
@@ -42,12 +43,15 @@ int main(int argc, const char ** argv) {
                 qtype.c_str());
         exit(1);
     }
-    struct quantization_params * qp = new quantization_params((uint32_t) *args.get_int_param("--n-threads"), valid_quantization_types.at(qtype));
-    qp->quantize_output_heads = args.get_bool_param("--quantize-output-heads");
-    qp->quantize_text_embeddings = args.get_bool_param("--quantize-text-embedding");
-    qp->quantize_cross_attn_kv = args.get_bool_param("--quantize-cross-attn-kv");
-    qp->convert_dac_to_f16 = args.get_bool_param("--convert-dac-to-f16");
-    qp->convert_non_quantizable_to_f16 = args.get_bool_param("--convert-non-quantized-to-f16");
-  	quantize_gguf(args.get_string_param("--model-path"), args.get_string_param("--quantized-model-path"), qp);
+    quantization_params qp {
+        .n_threads{ static_cast<uint32_t>(*args.get_int_param("--n-threads")) },
+        .quantize_type{valid_quantization_types.at(qtype)},  // quantization type
+        .quantize_output_heads{ args.get_bool_param("--quantize-output-heads")},
+        .quantize_text_embeddings{args.get_bool_param("--quantize-text-embedding")},
+        .quantize_cross_attn_kv{ args.get_bool_param("--quantize-cross-attn-kv")},
+        .convert_dac_to_f16{ args.get_bool_param("--convert-dac-to-f16")},
+        .convert_non_quantizable_to_f16{ args.get_bool_param("--convert-non-quantized-to-f16")},
+    };
+    quantize_gguf(args.get_string_param("--model-path").c_str(), args.get_string_param("--quantized-model-path").c_str(), qp);
     return 0;
 }
diff --git a/examples/quantize/quantize_impl.cpp b/examples/quantize/quantize_impl.cpp
new file mode 100644
index 0000000..5dce2fa
--- /dev/null
+++ b/examples/quantize/quantize_impl.cpp
@@ -0,0 +1,293 @@
+#include "quantize_impl.h"
+
+#include <array>
+#include <fstream>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "common.h"
+#include "ggml-cpp.h"
+#include "util.h"
+
+static bool kokoro_is_f16_compatible(std::string_view name) {  // name-only check: true when no exclusion below matches
+    return name.find("voice_tensors") == std::string::npos && name.find("bias") == std::string::npos &&
+           name.find("gamma") == std::string::npos && name.find("beta") == std::string::npos &&
+           name.find("alpha") == std::string::npos && !name.ends_with("embd") && !name.ends_with("norm");
+}
+
+static bool kokoro_is_quantizable(const std::string & name, const quantization_params & params) {
+    // A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors.
+    static constexpr std::array<std::string_view, 5> DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = {
+        "duration_proj", "encode", "shared_lstm", "duration_lstm", "layers"
+    };
+    static constexpr std::string_view DURATION_PREDICTOR_PREFIX = "kokoro.duration_predictor.";
+    const bool compatible = kokoro_is_f16_compatible(name);  // only F16-compatible tensors can ever be quantized
+    if (compatible && (name.starts_with("kokoro.albert") || name.starts_with("kokoro.text_encoder.lstm"))) {
+        return true;
+    }
+    if (compatible && name.starts_with(DURATION_PREDICTOR_PREFIX)) {
+        const std::string_view rest = std::string_view{ name }.substr(DURATION_PREDICTOR_PREFIX.size());
+        const std::string_view part = rest.substr(0, rest.find('.'));  // third dot-separated component, read in place
+        for (const auto candidate : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) {
+            if (candidate == part) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool dia_is_quantizable(std::string_view name, const quantization_params & params) {
+    // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized.
+    bool quantizable = !name.starts_with("audio_encoder") && !name.ends_with("norm");
+    if (!params.quantize_output_heads) {  // decoder heads are only quantized when explicitly requested
+        quantizable = quantizable && !name.starts_with("dia.decoder.heads");
+    }
+    return quantizable;
+}
+
+static bool parler_is_quanitizable(std::string_view name, const quantization_params & params) {
+    // the DAC audio encoder / decoder is not compatible with quantization; normalization weights and biases, the text encoding, and the positional embeddings shouldn't be quantized either.
+    bool quantizable = !name.starts_with("audio_encoder") && !name.ends_with("norm.weight") &&
+                       !name.ends_with("text_encoding") && !name.ends_with("positional_embed") &&
+                       !name.ends_with("norm.bias");
+    if (!params.quantize_output_heads) {  // opt-in: output heads
+        quantizable = quantizable && !name.ends_with("weight.head");
+    }
+    if (!params.quantize_text_embeddings) {  // opt-in: prompt embeddings
+        quantizable = quantizable && !name.ends_with("embed_prompts");
+    }
+    if (!params.quantize_cross_attn_kv) {  // opt-in: cross-attention key / value projections
+        quantizable = quantizable && !name.ends_with("encoder_attn.k_proj.weight") &&
+                      !name.ends_with("encoder_attn.v_proj.weight");
+    }
+    return quantizable;
+}
+
+static bool is_quantizable(tts_arch arch, const std::string & name, const quantization_params & params) {
+    switch (arch) {  // dispatch to the per-architecture tensor-name rules
+        case PARLER_TTS_ARCH:
+            return parler_is_quanitizable(name, params);
+        case DIA_ARCH:
+            return dia_is_quantizable(name, params);
+        case KOKORO_ARCH:
+            return kokoro_is_quantizable(name, params);
+        default:  // any other architecture value is treated as a fatal error
+            GGML_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch);
+    }
+}
+
+// Quantizes the F32 data of `tensor` to `qtype` into `new_data`, one ne[2] slice at a time, splitting each slice
+// across up to `n_threads` threads. Returns the sum of the sizes reported by ggml_quantize_chunk.
+static size_t quantize_tensor(void * new_data, const ggml_tensor * tensor, const float * imatrix, ggml_type qtype,
+                              uint32_t n_threads) {
+    // much of this is copied from llama.cpp
+    int chunk_size_multiplier = 1;
+    if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) {
+        if (((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) || tensor->ne[1] % 4 != 0) {
+            qtype = GGML_TYPE_Q4_0;
+        }
+        if (qtype == GGML_TYPE_Q4_0_8_8) {
+            chunk_size_multiplier = 8;
+        } else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) {
+            chunk_size_multiplier = 4;
+        }
+    }
+    size_t                   out_size       = 0;
+    const int64_t            d3_step        = tensor->ne[0] * tensor->ne[1];  // 64-bit: also used as a pointer offset
+    const int32_t            n_per_row      = tensor->ne[0];
+    const int32_t            nrows          = tensor->ne[1];
+    static constexpr int32_t min_chunk_size = 32 * 512;
+    const int32_t            chunk_size =
+        (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row)) *
+        chunk_size_multiplier;
+    uint32_t thread_count =
+        std::max(1, std::min(static_cast<int>(n_threads), static_cast<int>((d3_step + chunk_size - 1) / chunk_size)));
+    std::mutex mutex;
+
+    for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) {
+        const float * f32_data_d3 = static_cast<float *>(tensor->data) + d3_index * d3_step;
+        void * new_data_d3 = static_cast<char *>(new_data) + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows;
+        // const float * imatrix_03  = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr;
+        if (thread_count <= 1) {
+            // not threaded
+            out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix);
+        } else {
+            std::vector<std::thread> threads;
+            int64_t                  counter  = 0;
+            size_t                   new_size = 0;
+            bool                     valid    = true;
+            for (uint32_t t = 0; t < thread_count; t++) {
+                auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows,
+                             n_per_row, imatrix]() {
+                    const int64_t nrows_per_chunk = chunk_size / n_per_row;
+                    size_t        local_size      = 0;
+                    while (true) {
+                        std::unique_lock<std::mutex> lock(mutex);
+                        int64_t                      first_row = counter;
+                        counter += nrows_per_chunk;
+                        if (first_row >= nrows) {
+                            new_size += local_size;  // publish this worker's total while the lock is still held
+                            break;
+                        }
+                        lock.unlock();
+                        const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+                        const size_t  this_size = ggml_quantize_chunk(
+                            qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+                        local_size += this_size;
+
+                        // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded.
+                        const size_t row_size  = ggml_row_size(qtype, n_per_row);
+                        void *       this_data = static_cast<char *>(new_data_d3) + first_row * row_size;
+                        if (!ggml_validate_row_data(qtype, this_data, this_size)) {
+                            lock.lock();  // re-acquire the same lock instead of shadowing it with a second one
+                            valid = false;
+                            break;
+                        }
+                    }
+                };
+                threads.emplace_back(func);
+            }
+            for (auto & t : threads) {
+                t.join();
+            }
+
+            if (!valid) {
+                GGML_ABORT(
+                    "Validation of quantized data failed. Please try again and/or switch to single thread "
+                    "quantization.\n");
+            }
+            out_size += new_size;
+        }
+    }
+    return out_size;
+}
+
+// Writes `n` zero bytes to `file`; used for the metadata placeholder and for tensor alignment padding.
+static void zeros(std::ofstream & file, size_t n) {
+    // One write() of a zero-filled buffer instead of one write() call per byte.
+    const std::string padding(n, '\0');
+    file.write(padding.data(), static_cast<std::streamsize>(padding.size()));
+}
+
+template <typename T> struct no_init {  // wrapper whose default construction leaves `value` untouched (no zero-fill)
+    T value;
+
+    no_init() { /* intentionally empty: `value` is left uninitialized */ }
+};
+
+// Reads the GGUF model at `ifile`, quantizes or F16-converts the tensors selected by `params` for the model's
+// architecture, copies every other named tensor unchanged, and writes the result to `ofile`.
+// Aborts when the input cannot be loaded, when a tensor selected for conversion is not F32, or when the requested
+// type needs an importance matrix. Unnamed tensors are skipped. The output starts with a zeroed metadata
+// placeholder that is overwritten with the real metadata once all tensor data has been written.
+void quantize_gguf(const char * ifile, const char * ofile, const quantization_params & params) {
+    ggml_context *   weight_ctx{};
+    gguf_init_params gguf_params{
+        .no_alloc{ false },
+        .ctx{ &weight_ctx },
+    };
+    gguf_context * meta_ctx = gguf_init_from_file(ifile, gguf_params);
+    if (!meta_ctx) {
+        GGML_ABORT("%s failed. Could not load the GGUF file '%s'.\n", __func__, ifile);
+    }
+    // Own both input contexts so that they are freed when this function returns or throws.
+    const gguf_context_ptr meta_guard{ meta_ctx };
+    const ggml_context_ptr weight_guard{ weight_ctx };
+
+    std::string    arch     = "parler-tts";  // only parler-tts gguf files should lack an explicit architecture.
+
+    if (int arch_key = gguf_find_key(meta_ctx, "general.architecture"); arch_key != -1) {
+        arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
+    }
+    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
+
+    if (params.quantize_type != GGML_TYPE_Q5_0 && params.quantize_type != GGML_TYPE_Q8_0 &&
+        params.quantize_type != GGML_TYPE_F16 && params.quantize_type != GGML_TYPE_Q4_0) {
+        fprintf(stdout, "Warning, %s is untested for quantization type '%d'. Use at your own risk.\n", arch.c_str(),
+                params.quantize_type);
+    }
+
+    gguf_context_ptr ctx_out{ gguf_init_empty() };
+
+    // copy the KV pairs from the input file
+    gguf_set_kv(ctx_out.get(), meta_ctx);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION);
+    gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params.quantize_type);
+    for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor;
+         tensor               = ggml_get_next_tensor(weight_ctx, tensor)) {
+        if (*ggml_get_name(tensor)) {
+            gguf_add_tensor(ctx_out.get(), tensor);
+        }
+    }
+
+    std::vector<no_init<uint8_t>> work;
+
+    std::ofstream fout;
+    auto          close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out.get()));
+            gguf_get_meta_data(ctx_out.get(), data.data());
+            fout.write(reinterpret_cast<const char *>(data.data()), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&]() {
+        std::string fname = ofile;
+        fout              = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit);  // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_out.get());
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
+    new_ofstream();
+    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
+        const char * const     name = ggml_get_name(cur);
+        const std::string_view name_sv{ name };
+        if (!*name) {
+            continue;
+        }
+
+        // Quantizing takes precedence; F16 conversion is only considered for tensors that are not quantizable.
+        const bool quantize = is_quantizable(arch_type, name, params);
+        const bool to_f16   = !quantize &&
+            ((params.convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) ||
+             (params.convert_dac_to_f16 && name_sv.starts_with("audio_encoder") && !name_sv.ends_with("alpha")));
+
+        // Default: the tensor is copied through unchanged.
+        ggml_type new_type = cur->type;
+        void *    new_data = cur->data;
+        size_t    new_size = ggml_nbytes(cur);
+        if (quantize || to_f16) {
+            if (cur->type != GGML_TYPE_F32) {
+                GGML_ABORT(
+                    "ERROR: All %s tensors must be transformed from 32bit floats. Tensor, '%s', has improper "
+                    "type, '%d'\n",
+                    quantize ? "quantized" : "converted", cur->name, cur->type);
+            }
+            new_type = quantize ? params.quantize_type : GGML_TYPE_F16;
+            if (new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS) {
+                GGML_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type);
+            }
+            // The F32 input uses 4 bytes per element, which is an upper bound on the converted size.
+            const size_t f32_bytes = static_cast<size_t>(ggml_nelements(cur)) * 4;
+            if (work.size() < f32_bytes) {
+                work.resize(f32_bytes);
+            }
+            new_data = work.data();
+            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params.n_threads);
+        }
+
+        gguf_set_tensor_type(ctx_out.get(), name, new_type);
+        gguf_set_tensor_data(ctx_out.get(), name, new_data, new_size);
+        fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name, new_size);
+        // write tensor data + padding
+        fout.write(static_cast<const char *>(new_data), new_size);
+        zeros(fout, GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT) - new_size);
+    }
+    close_ofstream();
+}
diff --git a/examples/quantize/quantize_impl.h b/examples/quantize/quantize_impl.h
new file mode 100644
index 0000000..4c99eaf
--- /dev/null
+++ b/examples/quantize/quantize_impl.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "ggml.h"
+
+struct quantization_params {  // options consumed by quantize_gguf
+    uint32_t n_threads;  // upper bound on the worker threads used while quantizing a tensor
+    ggml_type quantize_type; // quantization type applied to every tensor selected for quantization
+    bool quantize_output_heads;  // also quantize output head tensors (checked by the Dia and Parler rules)
+    bool quantize_text_embeddings;  // also quantize Parler `embed_prompts` tensors
+    bool quantize_cross_attn_kv;  // also quantize Parler cross-attention k_proj / v_proj weights
+    bool convert_dac_to_f16;  // store non-quantized `audio_encoder` tensors (except `alpha`) as F16
+    bool convert_non_quantizable_to_f16;  // store other non-quantized, F16-compatible tensors as F16
+};
+
+void quantize_gguf(const char * ifile, const char * ofile, const quantization_params & params);  // reads the GGUF file `ifile`, writes the converted model to `ofile`
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 10c7c0d..bf9bd88 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -12,26 +12,27 @@
 #define MIMETYPE_JSON "application/json; charset=utf-8"
 #define MIMETYPE_HTML "text/html; charset=utf-8"
 
+#include <signal.h>
+
 #include <atomic>
+#include <chrono>
+#include <cinttypes>
 #include <condition_variable>
 #include <cstddef>
-#include <cinttypes>
 #include <deque>
+#include <filesystem>
 #include <memory>
 #include <mutex>
-#include <signal.h>
 #include <thread>
 #include <unordered_map>
-#include <filesystem>
 #include <unordered_set>
-#include <chrono>
-#include "tts.h"
-#include "audio_file.h"
+
+#include "../../src/models/loaders.h"
 #include "args.h"
+#include "audio_file.h"
 #include "common.h"
-#include "tts_server_threading_osx.h"
-
 #include "index.html.hpp"
+#include "tts_server_threading_osx.h"
 
 enum server_state {
     LOADING,  // Server is starting up / model loading
@@ -106,7 +107,7 @@ struct simple_server_task {
     task_type task;
     int id;
     std::string prompt;
-    generation_configuration * gen_config;
+    generation_configuration gen_config;
     void * response;
     size_t length;
     bool success = false;
@@ -224,14 +225,15 @@ void init_response_map(simple_response_map * rmap) {
 struct worker {
     worker(struct simple_task_queue * task_queue, struct simple_response_map * response_map, std::string text_encoder_path = "", int task_timeout = 300): task_queue(task_queue), response_map(response_map), text_encoder_path(text_encoder_path), task_timeout(task_timeout) {};
     ~worker() {
-        for (auto &[_, runner]: runners) {
-            delete runner;
+        // runners.clear();
+        for (auto & runner : views::values(runners)) {
+            static_cast<void>(!runner.release()); // TODO the destructor doesn't work yet
         }
     }
     struct simple_task_queue * task_queue;
     struct simple_response_map * response_map;
 
-    std::unordered_map<std::string, struct tts_runner *> runners;
+    unordered_map<string, unique_ptr<tts_generation_runner>> runners{};
     std::string text_encoder_path;
     std::atomic<bool> running = true;
     tts_server_threading::native_thread * thread = nullptr;
@@ -255,17 +257,16 @@ struct worker {
         if (task->timed_out(task_timeout)) {
             return;
         }
-        int outcome;
         tts_response * data = nullptr;
-        tts_runner* runner = runners[task->model];
+        tts_generation_runner & runner{*runners[task->model]};
         switch(task->task) {
             case TTS:
-                data = new tts_response;
-                outcome = generate(runner, task->prompt, data, task->gen_config);
-                task->response = (void*) data->data;
-                task->length = data->n_outputs;
-                task->sample_rate = runner->sampling_rate;
-                task->success = outcome == 0;
+                data              = new tts_response;
+                runner.generate(task->prompt.c_str(), *data, task->gen_config);
+                task->response    = (void *) data->data;
+                task->length      = data->n_outputs;
+                task->sample_rate = runner.sampling_rate;
+                task->success     = data->n_outputs != 0;
                 response_map->push(task);
                 break;
             case CONDITIONAL_PROMPT:
@@ -274,7 +275,7 @@ struct worker {
                     response_map->push(task);
                     break;
                 }
-                update_conditional_prompt(runner, text_encoder_path, task->prompt);
+                runner.update_conditional_prompt(text_encoder_path.c_str(), task->prompt.c_str());
                 task->success = true;
                 response_map->push(task);
                 break;
@@ -287,8 +288,8 @@ struct worker {
                     if (!runner->supports_voices) {
                         continue;
                     }
-                    std::string voices_string = "";
-                    for (auto voice : list_voices(runner)) {
+                    std::string voices_string{};
+                    for (const auto voice : runner->list_voices()) {
                         if (!voices_string.empty()) {
                             voices_string += ",";
                         }
@@ -312,9 +313,9 @@ struct worker {
     }
 };
 
-void init_worker(std::unordered_map<std::string, std::string>* model_path, int n_threads, bool cpu_only, generation_configuration * config, worker * w) {
+void init_worker(std::unordered_map<std::string, std::string>* model_path, int n_threads, bool cpu_only, const generation_configuration & config, worker * w) {
     for (const auto &[id, path] : *model_path) {
-        w->runners[id] = runner_from_file(path, n_threads, config, cpu_only);
+        w->runners[id] = runner_from_file(path.c_str(), n_threads, config, cpu_only);
     }
     w->loop();
 }
@@ -444,7 +445,7 @@ int main(int argc, const char ** argv) {
         exit(1);
     }
 
-    generation_configuration * default_generation_config = new generation_configuration(
+    const generation_configuration default_generation_config{
         args.get_string_param("--voice"),
         *args.get_int_param("--topk"),
         *args.get_float_param("--temperature"),
@@ -452,7 +453,7 @@ int main(int argc, const char ** argv) {
         !args.get_bool_param("--no-cross-attn"),
         args.get_string_param("--espeak-voice-id"),
         0,
-        *args.get_float_param("--top-p"));
+        *args.get_float_param("--top-p")};
 
     worker_pool * pool = nullptr;
     struct simple_task_queue * tqueue = new simple_task_queue;
@@ -651,34 +652,33 @@ int main(int argc, const char ** argv) {
         }
         struct simple_server_task * task = new simple_server_task(TTS, prompt);
         int id = task->id;
-        generation_configuration * conf = new generation_configuration();
-        std::memcpy((void*)conf, default_generation_config, sizeof(generation_configuration));
+        generation_configuration conf{default_generation_config};
         float temp;
         float rep_pen;
         float top_p;
         int top_k;
         if (data.contains("temperature") && data.at("temperature").is_number()) {
             temp = data.at("temperature").get<float>();
-            conf->temperature = temp;
+            conf.temperature = temp;
         }
 
         if (data.contains("top_k") && data.at("top_k").is_number()) {
             top_k = data.at("top_k").get<int>();
-            conf->top_k = top_k;
+            conf.top_k = top_k;
         }
 
         if (data.contains("top_p") && data.at("top_p").is_number()) {
             top_p = data.at("top_p").get<float>();
-            conf->top_p = top_p;
+            conf.top_p = top_p;
         }
 
         if (data.contains("repetition_penalty") && data.at("repetition_penalty").is_number()) {
             rep_pen = data.at("repetition_penalty").get<float>();
-            conf->repetition_penalty = rep_pen;
+            conf.repetition_penalty = rep_pen;
         }
 
         if (data.contains("voice") && data.at("voice").is_string()) {
-            conf->voice = data.at("voice").get<std::string>();
+            conf.voice = data.at("voice").get<std::string>();
         }
 
         if (data.contains("model") && data.at("model").is_string()) {
diff --git a/ggml b/ggml
index 136da02..70ba160 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 136da02ac32d5011cf9b46b117a0ea1be24e2bad
+Subproject commit 70ba16054447fb613d7c4fba76eeaf9ec0bfbeab
diff --git a/ggml-patches/.clang-format-ignore b/ggml-patches/.clang-format-ignore
new file mode 100644
index 0000000..42dea25
--- /dev/null
+++ b/ggml-patches/.clang-format-ignore
@@ -0,0 +1,2 @@
+llama-mmap.cpp
+llama-mmap.h
diff --git a/ggml-patches/CMakeLists.txt b/ggml-patches/CMakeLists.txt
new file mode 100644
index 0000000..8672efc
--- /dev/null
+++ b/ggml-patches/CMakeLists.txt
@@ -0,0 +1,6 @@
+target_sources(ggml PRIVATE
+        ggml-iterator.h
+        llama-mmap.cpp
+        llama-mmap.h
+)
+target_include_directories(ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/ggml-patches/README.txt b/ggml-patches/README.txt
new file mode 100644
index 0000000..8923ab8
--- /dev/null
+++ b/ggml-patches/README.txt
@@ -0,0 +1 @@
+TODO: upstream this
diff --git a/ggml-patches/ggml-iterator.h b/ggml-patches/ggml-iterator.h
new file mode 100644
index 0000000..5cd87f2
--- /dev/null
+++ b/ggml-patches/ggml-iterator.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <utility>
+
+#include "ggml.h"
+
+class gguf_key_iterator {  // Range and iterator in one: `for (auto [idx, key] : gguf_key_iterator{ ctx })` visits every key.
+    const gguf_context * const ctx;   // borrowed; this class never frees it
+    const int                  n_kv;  // key count read once at construction; doubles as the end position
+    int                        i{};   // index of the current key
+
+  public:
+    explicit gguf_key_iterator(const gguf_context & ctx) : ctx{ &ctx }, n_kv{ gguf_get_n_kv(&ctx) } {}
+    // Current index paired with the key string that gguf_get_key returns for it.
+    std::pair<int, const char *> operator*() const { return { i, gguf_get_key(ctx, i) }; }
+    // Pre-increment only; there is no bounds check, so callers must stop at end().
+    gguf_key_iterator & operator++() {
+        ++i;
+        return *this;
+    }
+    // Copy of this iterator rewound to index 0.
+    gguf_key_iterator begin() const {
+        auto result{ *this };
+        result.i = 0;
+        return result;
+    }
+    // Copy of this iterator positioned one past the last key.
+    gguf_key_iterator end() const {
+        auto result{ *this };
+        result.i = n_kv;
+        return result;
+    }
+    // Member-wise comparison of ctx, n_kv and i; the defaulted form needs C++20, which also derives != for range-for.
+    bool operator==(const gguf_key_iterator &) const = default;
+};
+
+class ggml_tensor_iterator {  // Range and iterator in one: `for (ggml_tensor & t : ggml_tensor_iterator{ ctx })` visits every tensor.
+    const ggml_context * const ctx;  // borrowed; this class never frees it
+    ggml_tensor *              cur;  // current tensor, nullptr once the walk is finished
+    // (declared with the members above) construction starts at ggml_get_first_tensor(ctx)
+  public:
+    explicit ggml_tensor_iterator(const ggml_context & ctx) : ctx{ &ctx }, cur{ ggml_get_first_tensor(&ctx) } {}
+    // Dereferences cur without a null check, so it must not be called on end().
+    ggml_tensor & operator*() const { return *cur; }
+    // Pre-increment only; advances through ggml_get_next_tensor.
+    ggml_tensor_iterator & operator++() {
+        cur = ggml_get_next_tensor(ctx, cur);
+        return *this;
+    }
+    // Copy of this iterator rewound to the context's first tensor.
+    ggml_tensor_iterator begin() const {
+        auto result{ *this };
+        result.cur = ggml_get_first_tensor(ctx);
+        return result;
+    }
+    // Copy of this iterator with cur == nullptr, which is the end marker.
+    ggml_tensor_iterator end() const {
+        auto result{ *this };
+        result.cur = nullptr;
+        return result;
+    }
+    // Member-wise comparison of ctx and cur; the defaulted form needs C++20, which also derives != for range-for.
+    bool operator==(const ggml_tensor_iterator &) const = default;
+};
diff --git a/ggml-patches/llama-mmap.cpp b/ggml-patches/llama-mmap.cpp
new file mode 100644
index 0000000..9a2f166
--- /dev/null
+++ b/ggml-patches/llama-mmap.cpp
@@ -0,0 +1,638 @@
+#include "llama-mmap.h"
+
+#include "ggml.h"
+#include "../ggml/src/ggml-impl.h"
+
+#include <cstring>
+#include <climits>
+#include <stdexcept>
+#include <cerrno>
+#include <algorithm>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+            #include <fcntl.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+// TODO: consider moving to llama-impl.h if needed in more places
+#if defined(_WIN32)
+#include <sstream>
+
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+#define USE_MSYNC false
+
+// llama_file
+
+struct llama_file::impl {
+#if defined(_WIN32)
+    HANDLE fp_win32;
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                    NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            std::ostringstream ss;
+            ss << "Win32 error code: " << std::hex << error_code;
+            ret = ss.str();
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(std::string{"failed to open "} + fname + ": " + strerror(errno));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(std::string{"read error: "} + GetErrorMessageWin32(GetLastError()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(std::string{"write error: "} + GetErrorMessageWin32(GetLastError()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~impl() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
+    impl(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(std::string{"failed to open "} + fname + ": " + strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        if (ret == -1) {
+            throw std::runtime_error(std::string{"ftell error: "} + strerror(errno));
+        }
+
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) const {
+// TODO: this ifdef is never true?
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        if (ret != 0) {
+            throw std::runtime_error(std::string{"seek error: "} + strerror(errno));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, len, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(std::string{"read error: "} + strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::runtime_error("unexpectedly reached end of file");
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, len, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(std::string{"write error: "} + strerror(errno));
+        }
+    }
+
+    void write_u32(uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~impl() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#endif
+
+    FILE * fp;
+    size_t size;
+};
+
+llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::~llama_file() = default;
+
+size_t llama_file::tell() const { return pimpl->tell(); }
+size_t llama_file::size() const { return pimpl->size; }
+
+int llama_file::file_id() const {
+#ifdef _WIN32
+    return _fileno(pimpl->fp);
+#else
+#if defined(fileno)
+    return fileno(pimpl->fp);
+#else
+    return ::fileno(pimpl->fp);
+#endif
+#endif
+}
+
+void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
+void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+
+uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
+
+void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
+void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
+
+// llama_mmap
+
+struct llama_mmap::impl {
+#ifdef _POSIX_MAPPED_FILES
+    std::vector<std::pair<size_t, size_t>> mapped_fragments;
+
+    // Maps the whole of `file` (MAP_SHARED; writable only on request) and optionally pre-faults the first
+    // `prefetch` bytes, clamped to the file size. `numa` turns prefetching off and hints random access.
+    // Throws std::runtime_error if mmap fails; failures of the advisory calls are only logged.
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) : writable{ writable } {
+        size = file->size();
+        int fd = file->file_id();
+        int flags = MAP_SHARED;
+        if (numa) { prefetch = 0; }
+#ifdef __linux__
+        if (const int err = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { // returns the error number; errno is not set
+            GGML_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", strerror(err));
+        }
+#endif
+        addr = mmap(NULL, file->size(), PROT_READ | (writable ? PROT_WRITE : 0), flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw std::runtime_error(std::string{"mmap failed: "} + strerror(errno));
+        }
+
+        // Clamp to the file size; this also turns the (size_t) -1 "everything" default into the whole file.
+        prefetch = std::min(file->size(), prefetch);
+        // if (madvise(addr, file->size(), MADV_HUGEPAGE)) { // Still does nothing
+        //     GGML_LOG_WARN("warning: madvise(.., MADV_HUGEPAGE) failed: %s\n",
+        //             strerror(errno));
+        // }
+        if (prefetch && !writable) {
+            // MADV_POPULATE_WRITE is a pessimization
+#ifdef __linux__
+            if (madvise(addr, prefetch, MADV_POPULATE_READ)) {
+                GGML_LOG_WARN("warning: madvise(.., MADV_POPULATE_READ) failed: %s\n", strerror(errno));
+            }
+#else
+            if (const int err = posix_madvise(addr, prefetch, POSIX_MADV_WILLNEED)) {
+                GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", strerror(err));
+            }
+#endif
+        }
+        if (numa) {
+            if (const int err = posix_madvise(addr, file->size(), POSIX_MADV_RANDOM)) {
+                GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", strerror(err));
+            }
+        }
+
+        mapped_fragments.emplace_back(0, file->size());
+    }
+
+    static void align_range(size_t * first, size_t * last, size_t page_size) {  // shrinks [*first, *last) to whole pages, in place
+        size_t offset_in_page = *first & (page_size - 1);  // mask arithmetic: relies on page_size being a power of two
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        *first += offset_to_page;
+        // *first is now rounded up to a page boundary; round *last down to one
+        *last = *last & ~(page_size - 1);
+        // if no whole page is left, collapse to the empty range at *first
+        if (*last <= *first) {
+            *last = *first;
+        }
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        if (writable) {
+            return;
+        }
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_range(&first, &last, page_size);
+        size_t len = last - first;
+
+        if (len == 0) {
+            return;
+        }
+
+        GGML_ASSERT(first % page_size == 0);
+        GGML_ASSERT(last % page_size == 0);
+        GGML_ASSERT(last > first);
+
+        void * next_page_start = (uint8_t *) addr + first;
+
+        if (munmap(next_page_start, len)) {
+            GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+        }
+
+        std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
+        for (const auto & frag : mapped_fragments) {
+            if (frag.first < first && frag.second > last) {
+                new_mapped_fragments.emplace_back(frag.first, first);
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first < first && frag.second > first) {
+                new_mapped_fragments.emplace_back(frag.first, first);
+            } else if (frag.first < last && frag.second > last) {
+                new_mapped_fragments.emplace_back(last, frag.second);
+            } else if (frag.first >= first && frag.second <= last) {
+            } else {
+                new_mapped_fragments.push_back(frag);
+            }
+        }
+        mapped_fragments = std::move(new_mapped_fragments);
+    }
+
+    ~impl() {
+        for (const auto & frag : mapped_fragments) {
+            if (writable) {
+                if (msync((char *) addr + frag.first, frag.second - frag.first, USE_MSYNC ? MS_SYNC : MS_ASYNC)) {
+                    GGML_LOG_WARN("warning: msync failed: %s\n", strerror(errno));
+                }
+            }
+            if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
+                GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
+            }
+        }
+    }
+#elif defined(_WIN32)
+    HANDLE hFile;
+    // Maps the whole of `file` into memory and, when `prefetch` > 0, asks the OS to page in the first
+    // min(size, prefetch) bytes. `numa` is ignored here. Throws std::runtime_error if mapping fails.
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) : writable{ writable } {
+        GGML_UNUSED(numa);
+        size = file->size();
+        hFile = (HANDLE) _get_osfhandle(file->file_id());
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, writable ? PAGE_READWRITE : PAGE_READONLY, 0, 0, NULL);
+
+        if (hMapping == NULL) {
+            DWORD error = GetLastError();
+            throw std::runtime_error(std::string{"CreateFileMappingA failed: "} + llama_format_win_err(error).c_str());
+        }
+        // The view needs its own write access: a FILE_MAP_READ view faults on stores even over a PAGE_READWRITE mapping.
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ | (writable ? FILE_MAP_WRITE : 0), 0, 0, 0);
+        DWORD error = GetLastError();
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw std::runtime_error(std::string{"MapViewOfFile failed: "} + llama_format_win_err(error).c_str());
+        }
+
+        if (prefetch > 0) {
+#if _WIN32_WINNT >= 0x602 or true
+            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+            HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
+
+            if (pPrefetchVirtualMemory) {
+                WIN32_MEMORY_RANGE_ENTRY range;
+                range.VirtualAddress = addr;
+                range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                    GGML_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+                            llama_format_win_err(GetLastError()).c_str());
+                }
+            }
+#else
+            throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
+        }
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+    }
+
+    ~impl() {
+        if (writable && USE_MSYNC) {
+            if (!FlushViewOfFile(addr, 0)) {
+                GGML_LOG_WARN("warning: FlushViewOfFile failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+            if (!FlushFileBuffers(hFile)) {
+                GGML_LOG_WARN("warning: FlushFileBuffers failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+        if (!UnmapViewOfFile(addr)) {
+            GGML_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    impl(struct llama_file * file, size_t prefetch, bool numa, bool writable) {
+        GGML_UNUSED(file);
+        GGML_UNUSED(prefetch);
+        GGML_UNUSED(numa);
+        GGML_UNUSED(writable);
+
+        throw std::runtime_error("mmap not supported");
+    }
+
+    void unmap_fragment(size_t first, size_t last) {
+        GGML_UNUSED(first);
+        GGML_UNUSED(last);
+
+        throw std::runtime_error("mmap not supported");
+    }
+#endif
+
+    void * addr;
+    size_t size;
+    bool writable;
+};
+
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa, bool writable) : pimpl(std::make_unique<impl>(file, prefetch, numa, writable)) {}
+llama_mmap::~llama_mmap() = default;
+
+size_t llama_mmap::size() const { return pimpl->size; }
+void * llama_mmap::addr() const { return pimpl->addr; }
+
+void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }
+
+#if defined(_POSIX_MAPPED_FILES) || defined(_WIN32) // the same conditions that select a working llama_mmap::impl
+const bool llama_mmap::SUPPORTED  = true;
+#else
+const bool llama_mmap::SUPPORTED  = false;
+#endif
+
+// llama_mlock
+
+struct llama_mlock::impl {
+#ifdef _POSIX_MEMLOCK_RANGE
+    static size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    bool raw_lock(const void * addr, size_t size) const {
+        if (!mlock(addr, size)) {
+            return true;
+        }
+
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION \
+        "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+        "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
+#else
+#define MLOCK_SUGGESTION \
+        "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
+#endif
+
+        char* errmsg = std::strerror(errno);
+        bool suggest = (errno == ENOMEM);
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
+        // visionOS/tvOS don't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on visionOS/tvOS
+        suggest = false;
+#else
+        struct rlimit lock_limit;
+        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
+            suggest = false;
+        }
+        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+            suggest = false;
+        }
+#endif
+
+        GGML_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+        return false;
+    }
+
+    static void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            GGML_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * ptr, size_t len) const {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(ptr, len)) {
+                return true;
+            }
+            if (tries == 2) {
+                GGML_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                    len, size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                GGML_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            size_t increment = len + 1048576;
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                GGML_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    static void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
+            GGML_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) const {
+        GGML_LOG_WARN("warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    static void raw_unlock(const void * addr, size_t len) {}
+#endif
+
+    impl() : addr(NULL), size(0), failed_already(false) {}
+
+    void init(void * ptr) {
+        GGML_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
+    }
+
+    void grow_to(size_t target_size) {  // extends the locked prefix of the buffer so it covers at least target_size bytes
+        GGML_ASSERT(addr);  // init() must have supplied the base address
+        if (failed_already) {  // after one failed lock every later call is a no-op
+            return;
+        }
+        size_t granularity = lock_granularity();
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);  // round up; mask form relies on a power-of-two granularity
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {  // lock only the part not yet covered
+                size = target_size;
+            } else {
+                failed_already = true;  // NOTE(review): nothing in this file calls raw_unlock, so locked ranges are never explicitly released - confirm intended
+            }
+        }
+    }
+
+    void * addr;
+    size_t size;
+
+    bool failed_already;
+};
+
+llama_mlock::llama_mlock() : pimpl(std::make_unique<impl>()) {}
+llama_mlock::~llama_mlock() = default;
+
+void llama_mlock::init(void * ptr) { pimpl->init(ptr); }
+void llama_mlock::grow_to(size_t target_size) { pimpl->grow_to(target_size); }
+
+#if defined(_POSIX_MEMLOCK_RANGE) || defined(_WIN32)
+const bool llama_mlock::SUPPORTED = true;
+#else
+const bool llama_mlock::SUPPORTED = false;
+#endif
+
+size_t llama_path_max() {
+    return PATH_MAX;
+}
diff --git a/ggml-patches/llama-mmap.h b/ggml-patches/llama-mmap.h
new file mode 100644
index 0000000..419579f
--- /dev/null
+++ b/ggml-patches/llama-mmap.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+struct llama_file;
+struct llama_mmap;
+struct llama_mlock;
+
+using llama_files  = std::vector<std::unique_ptr<llama_file>>;
+using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
+using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
+
+struct llama_file {  // stdio-backed file handle (pimpl); failures are reported as std::runtime_error
+    llama_file(const char * fname, const char * mode);  // opens via ggml_fopen and measures the size; throws if the open fails
+    ~llama_file();
+    // Current offset, and the total size as measured once at open time.
+    size_t tell() const;
+    size_t size() const;
+    // Underlying descriptor from fileno()/_fileno(); llama_mmap maps through it.
+    int file_id() const; // fileno overload
+    // `whence` is SEEK_SET / SEEK_CUR / SEEK_END; throws if the seek fails.
+    void seek(size_t offset, int whence) const;
+    // Reads exactly `len` bytes or throws (I/O error or premature end of file).
+    void read_raw(void * ptr, size_t len) const;
+    uint32_t read_u32() const;  // raw sizeof(uint32_t) bytes, no byte-order conversion
+    // Writes exactly `len` bytes or throws.
+    void write_raw(const void * ptr, size_t len) const;
+    void write_u32(uint32_t val) const;  // raw sizeof(uint32_t) bytes, no byte-order conversion
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mmap {  // memory mapping of an entire llama_file (pimpl); construction throws std::runtime_error on failure
+    llama_mmap(const llama_mmap &) = delete;
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false, bool writable = false);  // prefetch: leading bytes to page in up front, clamped to the file size
+    ~llama_mmap();
+    // Size of the mapped file in bytes and the base address of the mapping.
+    size_t size() const;
+    void * addr() const;
+    // POSIX: unmaps the whole pages inside [first, last) unless the mapping is writable; Windows: no-op; without mmap support: throws.
+    void unmap_fragment(size_t first, size_t last);
+    // Compile-time platform capability flag, defined in llama-mmap.cpp.
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+struct llama_mlock {  // pins a growing prefix of a buffer in physical memory (mlock / VirtualLock), pimpl
+    llama_mlock();
+    ~llama_mlock();
+    // init: sets the base address, asserts it was not set before. grow_to: locks up to target_size bytes from that base.
+    void init(void * ptr);
+    void grow_to(size_t target_size);  // rounded up to the lock granularity; once a lock attempt fails, later calls do nothing
+    // Compile-time platform capability flag, defined in llama-mmap.cpp.
+    static const bool SUPPORTED;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+size_t llama_path_max();
diff --git a/include/common.h b/include/common.h
index c3a1a1c..981b34c 100644
--- a/include/common.h
+++ b/include/common.h
@@ -1,11 +1,13 @@
-#ifndef common_h
-#define common_h
+#pragma once
 
 #include <cstdint>
 #include <string>
 #include <map>
+#include <memory>
 #include <vector>
 
+using namespace std;
+
 // Using this simple struct as opposed to a common std::vector allows us to return the cpu buffer
 // pointer directly rather than copying the contents of the buffer to a predefined std::vector.
 struct tts_response {
@@ -28,7 +30,7 @@ const std::map<std::string, tts_arch> SUPPORTED_ARCHITECTURES = {
 	{ "orpheus", ORPHEUS_ARCH }
 };
 
-/// Given a map from keys to values, creates a new map from values to keys 
+/// Given a map from keys to values, creates a new map from values to keys
 template<typename K, typename V>
 static std::map<V, K> reverse_map(const std::map<K, V>& m) {
     std::map<V, K> r;
@@ -43,10 +45,10 @@ const std::map<tts_arch, std::string> ARCHITECTURE_NAMES = reverse_map(SUPPORTED
 struct generation_configuration {
     generation_configuration(
     	std::string voice = "",
-    	int top_k = 50, 
-    	float temperature = 1.0, 
-    	float repetition_penalty = 1.0, 
-    	bool use_cross_attn = true, 
+    	int top_k = 50,
+    	float temperature = 1.0,
+    	float repetition_penalty = 1.0,
+    	bool use_cross_attn = true,
     	std::string espeak_voice_id = "",
     	int max_tokens = 0,
     	float top_p = 1.0,
@@ -64,17 +66,41 @@ struct generation_configuration {
 };
 
 struct tts_runner {
-	tts_arch arch;
 	struct ggml_context * ctx = nullptr;
 	float sampling_rate = 44100.0f;
 	bool supports_voices = false;
 
-	std::string arch_name() {
-		return ARCHITECTURE_NAMES.at(arch);
-	}
+    virtual ~tts_runner() = default;
 
 	void init_build(std::vector<uint8_t>* buf_compute_meta);
 	void free_build();
 };
 
-#endif
+struct ggml_tensor;
+struct tts_model_loader;
+struct llama_mmap;
+
+// Abstract base of the per-architecture text-to-speech runners: subclasses receive the model
+// tensors through assign_weight() and produce audio through generate().
+// Standard-library names are std::-qualified, like the rest of this header, so that these
+// declarations do not depend on a using-directive being in effect.
+struct tts_generation_runner : tts_runner {
+    // Only a reference is stored (presumably bound to the constructor argument), so the loader it
+    // refers to has to outlive the runner.
+    const std::reference_wrapper<const tts_model_loader> loader;
+    // llama_mmap is only forward-declared here, which is why the destructor is defined out of line.
+    // NOTE(review): presumably holds the mapping that backs the model weights -- confirm.
+    std::unique_ptr<llama_mmap> buf;
+    explicit tts_generation_runner(const tts_model_loader & loader);
+    ~tts_generation_runner() override;
+
+    // Called with a tensor and its name; each architecture decides where the tensor belongs.
+    virtual void                          assign_weight(const char * name, ggml_tensor & tensor) = 0;
+    // NOTE(review): presumably invoked once after every tensor has been assigned -- confirm.
+    virtual void                          prepare_post_load()                                    = 0;
+    // The two functions below are overridable but not pure; their defaults are defined elsewhere.
+    virtual std::vector<std::string_view> list_voices();
+    virtual void                          update_conditional_prompt(const char * file_path, const char * prompt);
+    // Synthesizes `sentence` into `output` using the settings in `config`.
+    virtual void generate(const char * sentence, tts_response & output, const generation_configuration & config) = 0;
+};
diff --git a/include/tts.h b/include/tts.h
deleted file mode 100644
index 30e98dc..0000000
--- a/include/tts.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef tts_h
-#define tts_h
-
-#include "parler_model.h"
-#include "kokoro_model.h"
-#include "dia_model.h"
-#include "orpheus_model.h"
-#include <thread>
-#include <fstream>
-#include <array>
-
-struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
-struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
-struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
-struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only);
-struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true);
-int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config);
-void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only = true);
-std::vector<std::string> list_voices(tts_runner * runner);
-
-struct quantization_params {
-    quantization_params(uint32_t n_threads, enum ggml_type quantize_type): n_threads(n_threads), quantize_type(quantize_type) {};
-    uint32_t n_threads;
-    enum ggml_type quantize_type; // quantization type
-    bool quantize_output_heads = false;
-    bool quantize_text_embeddings = false;
-    bool quantize_cross_attn_kv = false;
-    bool convert_dac_to_f16 = false;
-    bool convert_non_quantizable_to_f16 = false;
-};
-
-void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params);
-
-#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3d07940..77a1e2a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,45 +2,38 @@
 if (WIN32)
     if (BUILD_SHARED_LIBS)
         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
+    endif ()
+endif ()
 
 # TTS
 
 add_library(tts
-            ../include/tts.h
-            ../include/args.h
-            ../include/phonemizer.h
-            tts.cpp
-            tokenizer.cpp
-            sampler.cpp
-            parler_model.cpp
-            dac_model.cpp
-            util.cpp
-            args.cpp
-            t5_encoder_model.cpp
-            phonemizer.cpp
-            tts_model.cpp
-            kokoro_model.cpp
-            dia_model.cpp
-            orpheus_model.cpp
-            snac_model.cpp
-            general_neural_audio_codec.cpp
-            )
+        ../include/args.h
+        tokenizer.cpp
+        sampler.cpp
+        sampler.h
+        util.cpp
+        util.h
+        args.cpp
+        tts_model.cpp
+        tts_model.h
+)
 
 target_include_directories(tts PUBLIC . ../include ../ggml/src/)
 
-target_compile_features   (tts PUBLIC cxx_std_11) # don't bump
-
-if (ESPEAK_INCLUDE_DIRS)
-    set_source_files_properties(phonemizer.cpp PROPERTIES COMPILE_FLAGS "${ESPEAK_CFLAGS_OTHER}")
-    set_source_files_properties(phonemizer.cpp PROPERTIES INCLUDE_DIRECTORIES "${ESPEAK_INCLUDE_DIRS}")
-    target_link_libraries(tts PUBLIC ${ESPEAK_LIBRARIES})
-endif()
 target_link_libraries(tts PUBLIC ggml)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(tts PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(tts PRIVATE LLAMA_BUILD)
-    target_compile_definitions(tts PUBLIC  LLAMA_SHARED)
-endif()
+    target_compile_definitions(tts PUBLIC LLAMA_SHARED)
+endif ()
+
+add_subdirectory(decoder)
+add_subdirectory(models)
+
+if (ESPEAK_INCLUDE_DIRS)
+    set_source_files_properties(models/kokoro/phonemizer.cpp PROPERTIES COMPILE_FLAGS "${ESPEAK_CFLAGS_OTHER}")
+    set_source_files_properties(models/kokoro/phonemizer.cpp PROPERTIES INCLUDE_DIRECTORIES "${ESPEAK_INCLUDE_DIRS}")
+    target_link_libraries(tts PUBLIC ${ESPEAK_LIBRARIES})
+endif ()
diff --git a/src/decoder/CMakeLists.txt b/src/decoder/CMakeLists.txt
new file mode 100644
index 0000000..ef05806
--- /dev/null
+++ b/src/decoder/CMakeLists.txt
@@ -0,0 +1,8 @@
+target_sources(tts PRIVATE
+        dac_model.cpp
+        dac_model.h
+        general_neural_audio_codec.cpp
+        general_neural_audio_codec.h
+        snac_model.cpp
+        snac_model.h
+)
diff --git a/src/dac_model.cpp b/src/decoder/dac_model.cpp
similarity index 100%
rename from src/dac_model.cpp
rename to src/decoder/dac_model.cpp
index 6685007..defce7e 100644
--- a/src/dac_model.cpp
+++ b/src/decoder/dac_model.cpp
@@ -1,4 +1,5 @@
 #include "dac_model.h"
+
 #include <algorithm>
 #include <stdexcept>
 
@@ -209,4 +210,3 @@ void dac_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct t
     outputs->n_outputs = sequence_length * model->up_sampling_factor;
     return;
 }
-
diff --git a/src/dac_model.h b/src/decoder/dac_model.h
similarity index 99%
rename from src/dac_model.h
rename to src/decoder/dac_model.h
index be43ad0..c74a095 100644
--- a/src/dac_model.h
+++ b/src/decoder/dac_model.h
@@ -1,9 +1,10 @@
 #ifndef dac_model_h
 #define dac_model_h
 
-#include "general_neural_audio_codec.h"
 #include <map>
 
+#include "general_neural_audio_codec.h"
+
 enum dac_tensor {
     DAC_ENCODER_IN_KERNEL,
     DAC_ENCODER_IN_BIAS,
diff --git a/src/general_neural_audio_codec.cpp b/src/decoder/general_neural_audio_codec.cpp
similarity index 99%
rename from src/general_neural_audio_codec.cpp
rename to src/decoder/general_neural_audio_codec.cpp
index 8f7893e..371afef 100644
--- a/src/general_neural_audio_codec.cpp
+++ b/src/decoder/general_neural_audio_codec.cpp
@@ -1,7 +1,8 @@
 #include "general_neural_audio_codec.h"
+
 #include <algorithm>
-#include <stdexcept>
 #include <map>
+#include <stdexcept>
 
 namespace general_neural_audio_codec {
     // This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file
diff --git a/src/general_neural_audio_codec.h b/src/decoder/general_neural_audio_codec.h
similarity index 99%
rename from src/general_neural_audio_codec.h
rename to src/decoder/general_neural_audio_codec.h
index 1ec0a42..7ea7977 100644
--- a/src/general_neural_audio_codec.h
+++ b/src/decoder/general_neural_audio_codec.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "tts_model.h"
+#include "../tts_model.h"
 
 // This namespace implements a general abstraction of the core functionality used in common neural audio codecs like DAC and SNAC.
 namespace general_neural_audio_codec {
diff --git a/src/snac_model.cpp b/src/decoder/snac_model.cpp
similarity index 100%
rename from src/snac_model.cpp
rename to src/decoder/snac_model.cpp
diff --git a/src/snac_model.h b/src/decoder/snac_model.h
similarity index 100%
rename from src/snac_model.h
rename to src/decoder/snac_model.h
diff --git a/src/models/CMakeLists.txt b/src/models/CMakeLists.txt
new file mode 100644
index 0000000..861ac5e
--- /dev/null
+++ b/src/models/CMakeLists.txt
@@ -0,0 +1,9 @@
+target_sources(tts PRIVATE
+        loaders.cpp
+        loaders.h
+)
+
+add_subdirectory(dia)
+add_subdirectory(kokoro)
+add_subdirectory(orpheus)
+add_subdirectory(parler)
diff --git a/src/models/dia/CMakeLists.txt b/src/models/dia/CMakeLists.txt
new file mode 100644
index 0000000..2e86004
--- /dev/null
+++ b/src/models/dia/CMakeLists.txt
@@ -0,0 +1,5 @@
+target_sources(tts PRIVATE
+        loader.cpp
+        model.cpp
+        model.h
+)
diff --git a/src/models/dia/loader.cpp b/src/models/dia/loader.cpp
new file mode 100644
index 0000000..7bad51e
--- /dev/null
+++ b/src/models/dia/loader.cpp
@@ -0,0 +1,37 @@
+#include "../loaders.h"
+#include "model.h"
+
+// Intentionally empty. NOTE(review): presumably called from elsewhere so that this translation unit,
+// and with it the dia_loader object defined at the bottom, is kept by the linker -- confirm.
+void dia_register() {}
+
+// Passes the architecture name "dia" on to the tts_model_loader base.
+dia_model_loader::dia_model_loader() : tts_model_loader{ "dia" } {}
+
+// Creates the Dia model, its DAC audio decoder, a sampler and a KV cache from the given gguf/ggml
+// contexts and returns them bundled in a dia_runner. `config` is not used by this loader.
+//
+// Every object allocated here stays in a unique_ptr until the constructor that receives its raw
+// pointer has returned, so an exception thrown by a later step does not leak the earlier
+// allocations. The contexts returned by the build_new_*_context() helpers are passed on as raw
+// pointers because how they must be released is not visible from this file.
+std::unique_ptr<tts_generation_runner> dia_model_loader::from_file(gguf_context * meta_ctx,
+                                                                   ggml_context * weight_ctx, int n_threads,
+                                                                   bool cpu_only,
+                                                                   const generation_configuration & config) const {
+    auto model       = std::make_unique<dia_model>();
+    auto audio_model = std::make_unique<dac_model>();
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    auto          samp = std::make_unique<sampler>();
+    dac_context * dctx = build_new_dac_context(audio_model.get(), n_threads, cpu_only);
+    auto audio_decoder = std::make_unique<dac_runner>(audio_model.get(), dctx);
+    audio_model.release();  // audio_decoder received the raw pointer above
+    dia_context * diactx = build_new_dia_context(model.get(), n_threads, cpu_only);
+    auto          cache  = std::make_unique<dia_kv_cache>();
+    // The dia_runner constructor takes raw pointers, so the remaining owners are released into it.
+    return std::make_unique<dia_runner>(model.release(), audio_decoder.release(), diactx, samp.release(),
+                                        cache.release());
+}
+
+const dia_model_loader dia_loader{};
diff --git a/src/dia_model.cpp b/src/models/dia/model.cpp
similarity index 97%
rename from src/dia_model.cpp
rename to src/models/dia/model.cpp
index bd6dfd4..72d4b38 100644
--- a/src/dia_model.cpp
+++ b/src/models/dia/model.cpp
@@ -1,4 +1,4 @@
-#include "dia_model.h"
+#include "model.h"
 
 void dia_model::assign_weight(std::string name, struct ggml_tensor * tensor) {
     std::vector<std::string> parts = split(name, ".");
@@ -720,16 +720,6 @@ struct ggml_cgraph * dia_runner::build_dia_graph(dia_ubatch & batch) {
     return gf;
 }
 
-void dia_runner::configure_generation(generation_configuration * config) {
-    GGML_ASSERT(config->max_tokens == 0 || config->max_tokens > model->max_delay);
-    decode_sampler->temperature = config->temperature;
-    decode_sampler->repetition_penalty = config->repetition_penalty;
-    decode_sampler->do_sample = config->sample;
-    decode_sampler->top_k = config->top_k;
-    decode_sampler->top_p = config->top_p;
-    dctx->max_generation_size = config->max_tokens > model->max_delay ? config->max_tokens : model->max_generation_size;
-}
-
 void dia_runner::set_inputs(dia_ubatch & batch) {
     if (batch.encoder_step) {
         ggml_backend_tensor_set(dctx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(dctx->inp_tokens));
@@ -856,7 +846,7 @@ void dia_runner::adjust_output_tokens(std::vector<uint32_t> & output_tokens, std
     }
 }
 
-int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * output) {
+int dia_runner::generate_from_batch(dia_ubatch & batch, tts_response & output) {
     while (!check_stopping(batch)) {
         int state = decode(batch);
         if (state != 0) {
@@ -875,11 +865,19 @@ int dia_runner::generate_from_batch(dia_ubatch & batch, struct tts_response * ou
     std::vector<uint32_t> filtered_output_tokens;
     adjust_output_tokens(dctx->output_tokens, filtered_output_tokens);
 
-    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
+    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, &output);
     return 0;
 }
 
-int dia_runner::generate(std::string sentence, struct tts_response * output) {
+void dia_runner::generate(const char * sentence, tts_response & output, const generation_configuration & config) {
+    GGML_ASSERT(config.max_tokens == 0 || config.max_tokens > model->max_delay);
+    decode_sampler->temperature        = config.temperature;
+    decode_sampler->repetition_penalty = config.repetition_penalty;
+    decode_sampler->do_sample          = config.sample;
+    decode_sampler->top_k              = config.top_k;
+    decode_sampler->top_p              = config.top_p;
+    dctx->max_generation_size = config.max_tokens > model->max_delay ? config.max_tokens : model->max_generation_size;
+
     dia_ubatch batch = batch_from_sentence(sentence);
     dctx->reset();
     decode_sampler->reset();
@@ -887,25 +885,23 @@ int dia_runner::generate(std::string sentence, struct tts_response * output) {
     if (!kv_cross_self) {
         kv_cross_self = new dia_kv_cache;
         if (!dia_kv_cache_init(kv_cross_self, model, dctx)) {
-            return 1;
+            // NOTE(review): this failure used to be reported as return value 1; the function is now
+            // void, so nothing tells the caller that no audio was generated.
+            return;
         }
     }
-    return generate_from_batch(batch, output);
+    // NOTE(review): the int status returned by generate_from_batch() is discarded here.
+    generate_from_batch(batch, output);
 }
 
-void dia_runner::assign_weight(std::string name, ggml_tensor * tensor) {
-    if (tensor->data == NULL) {
-        return;
-    }
-
-    if (name.size() == 0) {
-        // handles the top level meta tensor
-        return;
-    }
-
-    if (name.size() > 14 && name.substr(0, 14) == "audio_encoder.") {
-        dac_runner->model->assign_weight(name.substr(14), tensor);
+// Forwards a tensor whose name starts with "audio_encoder." to the DAC model, with that prefix
+// removed, and every other tensor to the Dia model under its unchanged name.
+// NOTE(review): unlike the version this replaces, tensors with null data or an empty name are not
+// filtered out here -- presumably the caller does that now; confirm.
+void dia_runner::assign_weight(const char * name, ggml_tensor & tensor) {
+    if (const string_view name_sv{ name }; name_sv.starts_with("audio_encoder.")) {
+        dac_runner->model->assign_weight(string{ name_sv.substr(sizeof("audio_encoder.") - 1) }, &tensor);
     } else {
-        model->assign_weight(name, tensor);
-    }   
+        model->assign_weight(name, &tensor);
+    }
 }
diff --git a/src/dia_model.h b/src/models/dia/model.h
similarity index 89%
rename from src/dia_model.h
rename to src/models/dia/model.h
index bdca91d..8a36f9c 100644
--- a/src/dia_model.h
+++ b/src/models/dia/model.h
@@ -1,7 +1,17 @@
 #pragma once
 
-#include "dac_model.h"
-#include "sampler.h"
+#include "../../decoder/dac_model.h"
+#include "../../sampler.h"
+#include "models/loaders.h"
+
+// Loader type for the Dia architecture, defined together with the extern declaration of the
+// `dia_loader` object; the object itself is defined in another translation unit.
+extern const struct dia_model_loader final : tts_model_loader {
+    explicit dia_model_loader();
+
+    unique_ptr<tts_generation_runner> from_file(gguf_context * meta_ctx,
+                                     ggml_context * weight_ctx, int n_threads, bool cpu_only,
+                                     const generation_configuration & config) const override;
+} dia_loader;
 struct dia_encoder_layer {
     struct ggml_tensor * k;
@@ -165,8 +175,9 @@ static struct ggml_tensor * build_dia_decoder( ggml_cgraph * gf, ggml_context *
 
 // This struct is intended to support end-to-end TTS generation for the Dia model. As such, it manages Dia's model compilation, compute, generation,
 // tokenizationm and sampling process, and uses the dac_runner struct to encode audio outputs.
-struct dia_runner : tts_runner {
-    dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache): model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) {
+struct dia_runner : tts_generation_runner {
+    dia_runner(dia_model * model, dac_runner * audio_decoder, dia_context * dctx, sampler * samp, dia_kv_cache * cache):
+    tts_generation_runner{dia_loader}, model(model), dac_runner(audio_decoder), dctx(dctx), decode_sampler(samp), kv_cross_self(cache) {
         decode_sampler->vocab_size = model->output_vocab_size;
     };
     ~dia_runner() {
@@ -192,15 +203,14 @@ struct dia_runner : tts_runner {
 
     void tokenize_sentence(std::string sentence, dia_ubatch & tokens);
     dia_ubatch batch_from_sentence(std::string sentence);
-    void configure_generation(generation_configuration * config);
-    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_weight(const char * name, ggml_tensor & tensor) override;
     dia_ubatch build_worst_case_batch();
     struct ggml_cgraph * build_dia_graph(dia_ubatch & batch);
     void set_inputs(dia_ubatch & batch);
     int decode(dia_ubatch & batch);
-    void prepare_post_load();
-    int generate(std::string sentence, struct tts_response * response);
+    void prepare_post_load() override;
+    void generate(const char * sentence, tts_response & response, const generation_configuration & config) override;
     bool check_stopping(dia_ubatch & batch);
     void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered);
-    int generate_from_batch(dia_ubatch & batch, struct tts_response * output);
+    int generate_from_batch(dia_ubatch & batch, tts_response & output);
 };
diff --git a/src/models/kokoro/CMakeLists.txt b/src/models/kokoro/CMakeLists.txt
new file mode 100644
index 0000000..db438ca
--- /dev/null
+++ b/src/models/kokoro/CMakeLists.txt
@@ -0,0 +1,7 @@
+target_sources(tts PRIVATE
+        loader.cpp
+        model.cpp
+        model.h
+        phonemizer.cpp
+        phonemizer.h
+)
diff --git a/src/models/kokoro/loader.cpp b/src/models/kokoro/loader.cpp
new file mode 100644
index 0000000..3437cf1
--- /dev/null
+++ b/src/models/kokoro/loader.cpp
@@ -0,0 +1,37 @@
+#include "../loaders.h"
+#include "model.h"
+
+// Intentionally empty. NOTE(review): presumably called from elsewhere so that this translation unit,
+// and with it the kokoro_loader object defined at the bottom, is kept by the linker -- confirm.
+void kokoro_register() {}
+
+// Passes the architecture name "kokoro" on to the tts_model_loader base.
+kokoro_model_loader::kokoro_model_loader() : tts_model_loader{ "kokoro" } {}
+
+// Assembles a kokoro_runner: the model, its tokenizer, the duration runner, the generation context
+// and a phonemizer are created from the given gguf/ggml contexts.
+std::unique_ptr<tts_generation_runner> kokoro_model_loader::from_file(
+    gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, bool cpu_only,
+    const generation_configuration & config) const {
+    auto                    model     = std::make_unique<kokoro_model>();
+    single_pass_tokenizer * tokenizer = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+
+    kokoro_model * const      model_ptr    = model.get();
+    kokoro_duration_context * duration_ctx = build_new_duration_kokoro_context(model_ptr, n_threads, cpu_only);
+    auto * duration_runner                 = new kokoro_duration_runner(model_ptr, duration_ctx, tokenizer);
+    kokoro_context * generation_ctx        = build_new_kokoro_context(model_ptr, n_threads, cpu_only);
+
+    // An explicitly configured espeak voice id takes precedence. When it is empty, the id is derived
+    // from the kokoro voice name instead. NOTE(review): a fallback to American English is presumably
+    // implemented inside get_espeak_id_from_kokoro_voice() -- confirm.
+    const char * espeak_voice_id = config.espeak_voice_id.c_str();
+    if (espeak_voice_id[0] == '\0') {
+        espeak_voice_id = get_espeak_id_from_kokoro_voice(config.voice);
+    }
+    phonemizer * phonemes = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
+
+    return std::make_unique<kokoro_runner>(std::move(model), generation_ctx, tokenizer, duration_runner, phonemes);
+}
+
+const kokoro_model_loader kokoro_loader{};
diff --git a/src/kokoro_model.cpp b/src/models/kokoro/model.cpp
similarity index 95%
rename from src/kokoro_model.cpp
rename to src/models/kokoro/model.cpp
index 70f1972..72a0d36 100644
--- a/src/kokoro_model.cpp
+++ b/src/models/kokoro/model.cpp
@@ -1,4 +1,4 @@
-#include "kokoro_model.h"
+#include "model.h"
 
 static struct ggml_tensor * build_albert_attn_mask(ggml_context * ctx, struct kokoro_duration_context *kctx, const kokoro_ubatch & batch) {
     kctx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) batch.n_tokens, (int64_t) batch.n_tokens);
@@ -62,16 +62,16 @@ static struct ggml_tensor * build_lstm_run(ggml_context * ctx, ggml_cgraph * gf,
 		int i = reversed ? sequence_length - 1 - index : index;
 		struct ggml_tensor * I_cur = ggml_view_3d(ctx, I, I->ne[0], 1, I->ne[2], I->nb[0], I->nb[1], I->nb[1]*i);
 		I_cur = ggml_sigmoid(ctx, ggml_add(ctx, I_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[1], h_0), biases[1])));
-		
+
 		struct ggml_tensor * F_cur = ggml_view_3d(ctx, F, F->ne[0], 1, F->ne[2], F->nb[0], F->nb[1], F->nb[1]*i);
 		F_cur = ggml_sigmoid(ctx, ggml_add(ctx, F_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[3], h_0), biases[3])));
-		
+
 		struct ggml_tensor * G_cur = ggml_view_3d(ctx, G, G->ne[0], 1, G->ne[2], G->nb[0], G->nb[1], G->nb[1]*i);
 		G_cur = ggml_tanh(ctx, ggml_add(ctx, G_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[5], h_0), biases[5])));
-		
+
 		struct ggml_tensor * O_cur = ggml_view_3d(ctx, O, O->ne[0], 1, O->ne[2], O->nb[0], O->nb[1], O->nb[1]*i);
 		O_cur = ggml_sigmoid(ctx, ggml_add(ctx, O_cur, ggml_add(ctx, ggml_mul_mat(ctx, weights[7], h_0), biases[7])));
-		
+
 		c_0 = ggml_add(ctx, ggml_mul(ctx, F_cur, c_0), ggml_mul(ctx, I_cur, G_cur));
 		h_0 = ggml_mul(ctx, ggml_tanh(ctx, c_0), O_cur);
 
@@ -210,7 +210,7 @@ static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * m
 		cur = ggml_leaky_relu(ctx, cur, 0.1f, false);
 		cur = ggml_add(ctx, ggml_conv_transpose_1d(ctx, generator->ups[i]->upsample_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), generator->ups[i]->stride, generator->ups[i]->padding, 1, 0, 1), generator->ups[i]->upsample_bias);
 		if (i == generator->ups.size() - 1) {
-			// This is a hacky way of implementing the simple reflection padding used here. 
+			// This is a hacky way of implementing the simple reflection padding used here.
 			// In general, ggml should eventually be built to support expressive reflective padding but for such simple front padding this makes more sense.
 			struct ggml_tensor * temp = ggml_cont(ctx, ggml_view_3d(ctx, cur, 1, cur->ne[1], cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[0]));
 			cur = ggml_concat(ctx, temp, cur, 0);
@@ -232,8 +232,8 @@ static struct ggml_tensor * build_generator(ggml_context * ctx, kokoro_model * m
 	cur = ggml_leaky_relu(ctx, cur, 0.01f, false);
 	cur = ggml_add(ctx, ggml_conv_1d(ctx, generator->out_conv_weight, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, model->out_conv_padding, 1), generator->out_conv_bias);
 
-	struct ggml_tensor * spec = ggml_view_3d(ctx, cur, cur->ne[0], model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], 0);  
-	struct ggml_tensor * phase = ggml_view_3d(ctx, cur, cur->ne[0], cur->ne[1] - model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[1] * model->post_n_fft);  
+	struct ggml_tensor * spec = ggml_view_3d(ctx, cur, cur->ne[0], model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], 0);
+	struct ggml_tensor * phase = ggml_view_3d(ctx, cur, cur->ne[0], cur->ne[1] - model->post_n_fft, cur->ne[2], cur->nb[1], cur->nb[2], cur->nb[1] * model->post_n_fft);
 	phase = ggml_sin(ctx, phase);
 	spec = ggml_exp(ctx, spec);
 
@@ -385,7 +385,7 @@ void kokoro_model::post_load_assign() {
    	sampling_factor_scalar->data = (void *)((uint8_t *) ggml_backend_buffer_get_base(buf) + offset);
     size_t scsize = ggml_nbytes(sampling_factor_scalar);
     // while it might appear that the upsampling_rate could be used here, the interpolation rate (i.e. the upsampling scale) is actually independent in the kokoro model implementation.
-    float sample_scalar = upsample_scale*2.0f*M_PI; 
+    float sample_scalar = upsample_scale*2.0f*M_PI;
 	ggml_backend_tensor_set(sampling_factor_scalar, &sample_scalar, 0, scsize);
 	offset += scsize;
 	post_load_tensor_bytes = 300 + offset - original_offset;
@@ -410,24 +410,45 @@ void kokoro_model::assign_lstm(lstm * rnn, std::string name, ggml_tensor * tenso
 	}
 }
 
-void kokoro_model::assign_weight(std::string name, ggml_tensor * tensor) {
-	// all kokoro tensors are prepended by "kokoro" so lets trim that off and assign based on the module
-	std::vector<std::string> parts = split(name, ".");
-	if (parts.size() < 2) {
-		return; // handle the null context tensor;
-	}
-	if (parts[1] == "albert") {
-		assign_albert_weight(name.substr(7+parts[1].size()+1), tensor);
-	} else if (parts[1] == "duration_predictor") {
-		assign_duration_weight(name.substr(7+parts[1].size()+1), tensor);
-	} else if (parts[1] == "text_encoder") {
-		assign_text_encoder_weight(name.substr(7+parts[1].size()+1), tensor);
-	} else if (parts[1] == "decoder") {
-		assign_decoder_weight(name.substr(7+parts[1].size()+1), tensor);
-	} else if (parts[1] == "voice_tensors") {
-		voices[parts[2]] = ggml_dup_tensor(ctx, tensor);
-		set_tensor(voices[parts[2]], tensor);
-	}
+// Hands a tensor to the kokoro sub-module selected by the leading segment of `name`. The matched
+// prefix is stripped before the rest of the name is forwarded; names with no known prefix are ignored.
+void kokoro_model::assign_weight(const char * name, ggml_tensor & tensor) {
+    const string_view full_name{ name };
+    // Copies `full_name` without its first `prefix.size()` characters.
+    const auto strip = [&full_name](const string_view prefix) {
+        return string{ full_name.substr(prefix.size()) };
+    };
+
+    constexpr string_view albert_prefix{ "albert." };
+    constexpr string_view duration_prefix{ "duration_predictor." };
+    constexpr string_view text_encoder_prefix{ "text_encoder." };
+    constexpr string_view decoder_prefix{ "decoder." };
+    constexpr string_view voice_prefix{ "voice_tensors." };
+
+    if (full_name.starts_with(albert_prefix)) {
+        assign_albert_weight(strip(albert_prefix), &tensor);
+        return;
+    }
+    if (full_name.starts_with(duration_prefix)) {
+        assign_duration_weight(strip(duration_prefix), &tensor);
+        return;
+    }
+    if (full_name.starts_with(text_encoder_prefix)) {
+        assign_text_encoder_weight(strip(text_encoder_prefix), &tensor);
+        return;
+    }
+    if (full_name.starts_with(decoder_prefix)) {
+        assign_decoder_weight(strip(decoder_prefix), &tensor);
+        return;
+    }
+    if (!full_name.starts_with(voice_prefix)) {
+        return;
+    }
+    // The result of ggml_dup_tensor() is stored in `voices` under the text that follows the prefix
+    // and is then passed to set_tensor() together with the source tensor.
+    const string voice_name = strip(voice_prefix);
+    auto * const duplicate  = ggml_dup_tensor(ctx, &tensor);
+    voices[voice_name]      = duplicate;
+    set_tensor(duplicate, &tensor);
 }
 
 void kokoro_model::assign_generator_weight(kokoro_generator * generator, std::string name, ggml_tensor * tensor) {
@@ -484,7 +480,7 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
 		set_tensor(block->adain1d_1_gamma_biases[i], tensor);
 	} else if (parts[1] == "gamma2_bias") {
 		block->adain1d_2_gamma_biases[i] = ggml_dup_tensor(ctx, tensor);
-		set_tensor(block->adain1d_2_gamma_biases[i], tensor);		
+		set_tensor(block->adain1d_2_gamma_biases[i], tensor);
 	} else if (parts[1] == "beta1_weight") {
 		block->adain1d_1_beta_weights[i] = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->adain1d_1_beta_weights[i], tensor);
@@ -496,7 +492,7 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
 		set_tensor(block->adain1d_1_beta_biases[i], tensor);
 	} else if (parts[1] == "beta2_bias") {
 		block->adain1d_2_beta_biases[i] = ggml_dup_tensor(ctx, tensor);
-		set_tensor(block->adain1d_2_beta_biases[i], tensor);		
+		set_tensor(block->adain1d_2_beta_biases[i], tensor);
 	} else if (parts[1] == "convs1_weight") {
 		block->convs1_weights[i] = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->convs1_weights[i], tensor);
@@ -508,13 +504,13 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
 		set_tensor(block->convs1_biases[i], tensor);
 	} else if (parts[1] == "convs2_bias") {
 		block->convs2_biases[i] = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
-		set_tensor(block->convs2_biases[i], tensor);		
+		set_tensor(block->convs2_biases[i], tensor);
 	} else if (parts[1] == "alpha1") {
 		block->input_alphas[i] = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->input_alphas[i], tensor);
 	} else if (parts[1] == "alpha2") {
 		block->output_alphas[i] = ggml_dup_tensor(ctx, tensor);
-		set_tensor(block->output_alphas[i], tensor);		
+		set_tensor(block->output_alphas[i], tensor);
 	}
 }
 
@@ -540,7 +536,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
 		set_tensor(block->norm1_gamma_bias, tensor);
 	} else if (name == "norm2_gamma_bias") {
 		block->norm2_gamma_bias = ggml_dup_tensor(ctx, tensor);
-		set_tensor(block->norm2_gamma_bias, tensor);		
+		set_tensor(block->norm2_gamma_bias, tensor);
 	} else if (name == "norm1_beta_weight") {
 		block->norm1_beta = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->norm1_beta, tensor);
@@ -552,7 +548,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
 		set_tensor(block->norm1_beta_bias, tensor);
 	} else if (name == "norm2_beta_bias") {
 		block->norm2_beta_bias = ggml_dup_tensor(ctx, tensor);
-		set_tensor(block->norm2_beta_bias, tensor);		
+		set_tensor(block->norm2_beta_bias, tensor);
 	} else if (name == "conv1_weight") {
 		block->conv1 = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->conv1, tensor);
@@ -564,20 +560,20 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
 		set_tensor(block->conv1_bias, tensor);
 	} else if (name == "conv2_bias") {
 		block->conv2_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
-		set_tensor(block->conv2_bias, tensor);		
+		set_tensor(block->conv2_bias, tensor);
 	} else if (name == "pool_weight") {
 		block->pool = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->pool, tensor);
 	} else if (name == "pool_bias") {
 		block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
-		set_tensor(block->pool_bias, tensor);		
+		set_tensor(block->pool_bias, tensor);
 	} else if (name == "conv1x1_weight") {
 		tensor = squeeze_3d_2d_e0(ctx, tensor);
 		block->upsample = ggml_dup_tensor(ctx, tensor);
 		set_tensor(block->upsample, tensor);
 	} else if (name == "conv1x1_bias") {
 		block->upsample_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
-		set_tensor(block->upsample_bias, tensor);		
+		set_tensor(block->upsample_bias, tensor);
 	}
 }
 
@@ -853,7 +849,7 @@ void kokoro_model::prep_constants(gguf_context * meta) {
     if (vocab_size_key != -1) {
         vocab_size = gguf_get_val_u32(meta, vocab_size_key);
     }
-    
+
     int hidden_size_key = gguf_find_key(meta, "kokoro.duration_predictor.albert.hidden_size");
     if (hidden_size_key != -1) {
         hidden_size = gguf_get_val_u32(meta, hidden_size_key);
@@ -967,7 +963,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     cur = inpL;
 
     struct ggml_tensor * KQ_mask_dec = build_albert_attn_mask(ctx, kctx, batch);
-    
+
     for (int r = 0; r < model->n_recurrence; r++) {
     	for (int l = 0; l < model->n_layers; l++) {
 	        struct ggml_tensor * residual = cur ;
@@ -1046,7 +1042,7 @@ struct ggml_cgraph * kokoro_duration_runner::build_kokoro_duration_graph(kokoro_
     ggml_build_forward_expand(gf, len);
 
     free_build();
-    
+
     return gf;
 }
 
@@ -1087,7 +1083,7 @@ void kokoro_duration_runner::run(kokoro_ubatch & batch) {
 
     prev_size = kctx->buf_len_output ? ggml_backend_buffer_get_size(kctx->buf_len_output) : 0;
     new_size = model->max_context_length * sizeof(float);
-    
+
     if (!kctx->buf_len_output || prev_size < new_size) {
         if (kctx->buf_output) {
             ggml_backend_buffer_free(kctx->buf_len_output);
@@ -1097,22 +1093,22 @@ void kokoro_duration_runner::run(kokoro_ubatch & batch) {
 
         kctx->buf_len_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size);
     }
-    
-    
+
+
     batch.resp->hidden_states = (float *) ggml_backend_buffer_get_base(kctx->buf_output);
     ggml_backend_buffer_clear(kctx->buf_output, 0);
     batch.resp->lengths = (float *) ggml_backend_buffer_get_base(kctx->buf_len_output);
     ggml_backend_buffer_clear(kctx->buf_len_output, 0);
-    
+
     struct ggml_cgraph * gf = NULL;
     gf = build_kokoro_duration_graph(batch);
-    
+
     // the output is always the last tensor in the graph
     struct ggml_tensor * lens = gf->nodes[gf->n_nodes - 1];
     // the reused duration hidden states are computed before a node chunk which has a size that is sequence length dependent
     struct ggml_tensor * hidden_states = gf->nodes[gf->n_nodes - 22 - 52 * batch.n_tokens];
     ggml_backend_sched_alloc_graph(kctx->sched, gf);
-    
+
     set_inputs(batch);
 
     ggml_backend_sched_graph_compute_async(kctx->sched, gf);
@@ -1192,7 +1188,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
 	n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias);
 	ggml_set_name(n, "n_out");
 	ggml_build_forward_expand(gf, n);
-    
+
 	// kokoro text encoding;
 	struct ggml_tensor * asr;
 	//struct ggml_tensor * embd;
@@ -1210,7 +1206,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
 		asr = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), ggml_cont(ctx, ggml_transpose(ctx, kctx->duration_mask)));
 	}
 
-	// decoding and generation prep 
+	// decoding and generation prep
 	struct ggml_tensor * asr_res;
 	struct ggml_tensor * f0;
 	struct ggml_tensor * n_base;
@@ -1239,7 +1235,7 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
 	ggml_set_input(kctx->window_sq_sum);
 
 	// run generation
-	cur = build_generator(ctx, model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
+	cur = build_generator(ctx, &*model, kctx, cur, style_half2, f0_curve, model->decoder->generator, (int)kctx->sequence_length, kctx->window_sq_sum, gf);
     ggml_build_forward_expand(gf, cur);
     free_build();
     return gf;
@@ -1277,7 +1273,7 @@ void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) {
     }
 }
 
-void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) {
+void kokoro_runner::run(kokoro_ubatch & batch, tts_response & outputs) {
 	batch.resp = new kokoro_duration_response;
 	drunner->run(batch);
 
@@ -1299,7 +1295,7 @@ void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) {
         kctx->buf_output = ggml_backend_buft_alloc_buffer(kctx->backend_cpu_buffer, new_size);
     }
 
-    outputs->data = (float *) ggml_backend_buffer_get_base(kctx->buf_output);
+    outputs.data = (float *) ggml_backend_buffer_get_base(kctx->buf_output);
     ggml_backend_buffer_clear(kctx->buf_output, 0);
 
     kctx->sequence_length = batch.n_tokens;
@@ -1307,34 +1303,37 @@ void kokoro_runner::run(kokoro_ubatch & batch, tts_response * outputs) {
 
     struct ggml_cgraph * gf = NULL;
     gf = build_kokoro_graph(batch);
-    
+
     // the output is always the last tensor in the graph
     struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
 
     ggml_backend_sched_alloc_graph(kctx->sched, gf);
-    
+
     set_inputs(batch, total_length);
 
     ggml_backend_sched_graph_compute_async(kctx->sched, gf);
 
-    kctx->get_ggml_node_data(output, outputs->data, new_size);
+    kctx->get_ggml_node_data(output, outputs.data, new_size);
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
     ggml_backend_sched_reset(kctx->sched);
-    outputs->n_outputs = total_length*model->up_sampling_factor;
+    outputs.n_outputs = total_length*model->up_sampling_factor;
-    free(batch.resp);
+    delete batch.resp;  // batch.resp is allocated with `new` at the top of run(), so release it with delete, not free()
     return;
 }
 
-void kokoro_runner::assign_weight(std::string name, ggml_tensor * tensor) {
-	model->assign_weight(name, tensor);
+void kokoro_runner::assign_weight(const char * name, ggml_tensor & tensor) {  // forwards one named tensor to the model
+    const string_view name_sv{ name };
+    GGML_ASSERT(name_sv.starts_with("kokoro."));  // any tensor name without the "kokoro." prefix is treated as fatal
+    const string trimmed{ name_sv.substr(sizeof("kokoro.") - 1) };  // sizeof counts the terminating NUL, hence the -1
+    model->assign_weight(trimmed.c_str(), tensor);  // the model receives the name with the prefix removed
 }
 
 /*
  * #tokenize_chunks is used to split up a larger than max context size (512) token prompt into discrete
  * blocks for generation. This solution, in accordance with Kokoro's pyTorch implementation, splits
- * the prompt by sentence when possible (this can result in slower inference but generally produces cleaner 
+ * the prompt by sentence when possible (this can result in slower inference but generally produces cleaner
  * speech). If a disinct sentence is too long, then it splits at the nearest space.
  */
 std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<std::string> clauses) {
@@ -1343,7 +1342,7 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
 		clause = strip(clause);
 		if (clause.empty()) {
 			continue;
-		} 
+		}
 		std::vector<uint32_t> tokens;
 		tokens.push_back(model->bos_token_id);
 		tokenizer->tokenize(clause, tokens);
@@ -1387,33 +1386,35 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
 	return chunks;
 }
 
-int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
-	if (model->voices.find(voice) == model->voices.end()) {
-		TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+void kokoro_runner::generate(const char * prompt, tts_response & response, const generation_configuration & config) {
+	if (model->voices.find(config.voice) == model->voices.end()) {
+		TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", config.voice.c_str());
     } else {
     	// if the language changed then we should change the phonemization voice
-    	if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {
+    	if (phmzr->mode == ESPEAK && kctx->voice[0] != config.voice[0]) {
+            std::string voice_code{config.espeak_voice_id};
     		if (voice_code.empty()) {
-    			voice_code = get_espeak_id_from_kokoro_voice(voice);
+    			voice_code = get_espeak_id_from_kokoro_voice(config.voice);
     		}
     		update_voice(voice_code);
     	}
-        kctx->voice = voice;
-        drunner->kctx->voice = voice;
+        kctx->voice = config.voice;
+        drunner->kctx->voice = config.voice;
     }
     // replace all non-sentence terminating characters with '--' which espeak will treat as a pause.
     // We preserve the other punctuation for cleaner chunking pre-tokenization
-    prompt = replace_any(prompt, ",;:", "--");
-    prompt = replace_any(prompt, "\n", " ");
-  	std::string phonemized_prompt = phmzr->text_to_phonemes(prompt);
+    std::string normalized{prompt};
+    normalized = replace_any(normalized, ",;:", "--");  // chain on the running result, not on the raw prompt
+    normalized = replace_any(normalized, "\n", " ");    // otherwise the previous substitution would be discarded
+    std::string phonemized_prompt = phmzr->text_to_phonemes(normalized);
 
-  	// Kokoro users a utf-8 single character tokenizer so if the size of the prompt is smaller than the max context length without the 
+  	// Kokoro uses a utf-8 single character tokenizer so if the size of the prompt is smaller than the max context length without the
   	// beginning of sentence and end of sentence tokens then we can compute it all at once.
-  	if (phonemized_prompt.size() < model->max_context_length - 2) { 
+  	if (phonemized_prompt.size() < model->max_context_length - 2) {
   		// we preserved punctuation and Kokoro interprets these tokens as end of sentence tokens, so we have to remove them for all-at-once compute.
   		phonemized_prompt = strip(replace_any(phonemized_prompt, ".!?", ""));
   		if (phonemized_prompt.empty()) {
-  			return 0;
+  			return;
   		}
 		std::vector<uint32_t> tokens;
 		tokens.push_back(model->bos_token_id);
@@ -1425,32 +1426,26 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response,
 		run(batch, response);
   	} else {
   		// TODO: determine the performance to memory trade off in using a batched compute approach verse this chunking approach.
-  		// This approach is likely to be slower than a batched approach, but given the already huge memory overhead of Kokoro's graph it 
+  		// This approach is likely to be slower than a batched approach, but given the already huge memory overhead of Kokoro's graph it
   		// might be preferable to use this chunking approach.
   		std::vector<std::string> clauses = split(phonemized_prompt, ".!?");
   		for (auto tokens : tokenize_chunks(clauses)) {
 			kokoro_ubatch batch;
 			batch.n_tokens = tokens.size();
 			batch.input_tokens = tokens.data();
-			struct tts_response * partial = new tts_response;
+			tts_response partial{};
 			run(batch, partial);
 			append_to_response(response, partial);
 		}
   	}
-  	return 0;
 }
 
-std::vector<std::string> kokoro_runner::list_voices() {
-	std::vector<std::string> voices;
-	voices.reserve(model->voices.size());
-	for (auto voice : model->voices) {
-		voices.push_back(voice.first);
-	}
-	return voices;
+std::vector<std::string_view> kokoro_runner::list_voices() {
+    const auto voices{ views::keys(model->voices) | views::transform([](const auto & x) { return string_view{ x }; }) };
+    return std::vector(cbegin(voices), cend(voices));
 }
 
-
-std::string get_espeak_id_from_kokoro_voice(std::string voice) {
+const char * get_espeak_id_from_kokoro_voice(std::string voice) {
 	return !voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(voice[0]) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[voice[0]] : "gmw/en-US";
 }
 
diff --git a/src/kokoro_model.h b/src/models/kokoro/model.h
similarity index 91%
rename from src/kokoro_model.h
rename to src/models/kokoro/model.h
index b4f4f96..eed74aa 100644
--- a/src/kokoro_model.h
+++ b/src/models/kokoro/model.h
@@ -1,15 +1,23 @@
-#ifndef kokoro_model_h
-#define kokoro_model_h
+#pragma once
 
-#include <stdlib.h>
-#include "tts_model.h"
-#include "tokenizer.h"
+#include <cstdlib>
+
+#include "../../tokenizer.h"
+#include "../../tts_model.h"
+#include "models/loaders.h"
 #include "phonemizer.h"
 
+extern const struct kokoro_model_loader final : tts_model_loader {
+    explicit kokoro_model_loader();
+
+    unique_ptr<tts_generation_runner> from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads,
+                                                bool cpu_only, const generation_configuration & config) const override;
+} kokoro_loader;
+
 // Rather than using ISO 639-2 language codes, Kokoro voice pack specify their corresponding language via their first letter.
 // Below is a map that describes the relationship between those designations and espeak-ng's voice identifiers so that the 
 // appropriate phonemization protocol can inferred from the Kokoro voice.
-static std::map<char, std::string> KOKORO_LANG_TO_ESPEAK_ID = {
+static std::map<char, const char *> KOKORO_LANG_TO_ESPEAK_ID = {
 	{'a', "gmw/en-US"},
 	{'b', "gmw/en"},
 	{'e', "roa/es"},
@@ -283,7 +291,7 @@ struct kokoro_model : tts_model {
 
 
 	void post_load_assign();
-    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_weight(const char * name, ggml_tensor & tensor);
     void prep_layers(gguf_context * meta);
     void prep_constants(gguf_context * meta);
     void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only = true) {
@@ -344,9 +352,9 @@ static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx,
 static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style);
 static kokoro_generator_residual_block * build_res_block_from_file(gguf_context * meta, std::string base_config_key);
 static kokoro_noise_residual_block * build_noise_block_from_file(gguf_context * meta, int index);
-static kokoro_generator_upsample_block* kokoro_generator_upsample_block(gguf_context * meta, int index);
+static kokoro_generator_upsample_block * kokoro_generator_upsample_block(gguf_context * meta, int index);
 
-std::string get_espeak_id_from_kokoro_voice(std::string voice);
+const char * get_espeak_id_from_kokoro_voice(std::string voice);
 struct kokoro_duration_context * build_new_duration_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
 
 struct kokoro_duration_response {
@@ -421,8 +429,8 @@ static struct ggml_tensor * build_sin_gen(ggml_context * ctx, kokoro_model * mod
 struct kokoro_context * build_new_kokoro_context(struct kokoro_model * model, int n_threads, bool use_cpu = true);
 
 // This manages the graph compilation of computation for the Kokoro model.
-struct kokoro_runner : tts_runner {
-    kokoro_runner(kokoro_model * model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): model(model), kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
+struct kokoro_runner : tts_generation_runner {
+    kokoro_runner(unique_ptr<kokoro_model> model, kokoro_context * context, single_pass_tokenizer * tokenizer, kokoro_duration_runner * drunner, phonemizer * phmzr): tts_generation_runner{kokoro_loader}, model{move(model)}, kctx(context), tokenizer(tokenizer), drunner(drunner), phmzr(phmzr) {
     	tts_runner::sampling_rate = 24000.0f;
     	tts_runner::supports_voices = true;
     };
@@ -432,12 +440,11 @@ struct kokoro_runner : tts_runner {
         }
         delete drunner;
         model->free();
-        delete model;
         delete kctx;
         delete phmzr;
     }
     struct single_pass_tokenizer * tokenizer;
-    kokoro_model * model;
+    unique_ptr<kokoro_model> model;
     kokoro_context * kctx;
     kokoro_duration_runner * drunner;
     phonemizer * phmzr;
@@ -448,15 +455,13 @@ struct kokoro_runner : tts_runner {
         tts_runner::init_build(&kctx->buf_compute_meta);
     }
 
-    std::vector<std::string> list_voices();
+    std::vector<std::string_view> list_voices() override;
     std::vector<std::vector<uint32_t>> tokenize_chunks(std::vector<std::string> clauses);
-    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_weight(const char * name, ggml_tensor & tensor) override;  // called through the generic runner interface while loading
-    void prepare_post_load();
+    void prepare_post_load() override;
     kokoro_ubatch build_worst_case_batch();
     void set_inputs(kokoro_ubatch & batch, uint32_t total_size);
     struct ggml_cgraph * build_kokoro_graph(kokoro_ubatch & batch);
-    void run(kokoro_ubatch & batch, struct tts_response * outputs);
-    int generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code = "");
+    void run(kokoro_ubatch & batch, tts_response & outputs);
+    void generate(const char * prompt, tts_response & response, const generation_configuration & config) override;
 };
-
-#endif
diff --git a/src/phonemizer.cpp b/src/models/kokoro/phonemizer.cpp
similarity index 100%
rename from src/phonemizer.cpp
rename to src/models/kokoro/phonemizer.cpp
diff --git a/include/phonemizer.h b/src/models/kokoro/phonemizer.h
similarity index 100%
rename from include/phonemizer.h
rename to src/models/kokoro/phonemizer.h
diff --git a/src/models/loaders.cpp b/src/models/loaders.cpp
new file mode 100644
index 0000000..ace14de
--- /dev/null
+++ b/src/models/loaders.cpp
@@ -0,0 +1,96 @@
+#include "loaders.h"
+
+#include <cstring>
+#include <unordered_map>
+
+#include "common.h"
+#include "ggml-iterator.h"
+#include "ggml.h"
+#include "llama-mmap.h"
+
+// Registry of model loaders keyed by architecture name.
+// Kept as a function-local static: loader objects are globals defined in other translation units and
+// register themselves from their constructors, which may run before a namespace-scope map is constructed.
+static unordered_map<string_view, reference_wrapper<const tts_model_loader>> & loader_registry() {
+    static unordered_map<string_view, reference_wrapper<const tts_model_loader>> registry;
+    return registry;
+}
+
+tts_model_loader::tts_model_loader(const char * arch) : arch{ arch } {
+    loader_registry().emplace(arch, ref(*this));
+}
+
+void dia_register();
+void kokoro_register();
+void orpheus_register();
+void parler_register();
+
+[[maybe_unused]] static bool loaders = [] {
+    dia_register();
+    kokoro_register();
+    orpheus_register();
+    parler_register();
+    return true;
+}();
+
+// Loads the GGUF file `fname`, picks the loader registered for its general.architecture value and
+// returns a runner with every named, data-backed tensor assigned. Aborts if the file cannot be
+// parsed, the architecture key is missing, or no loader is registered for the architecture.
+// currently only metal and cpu devices are supported,
+// so cpu_only only describes whether or not to try to load and run on metal.
+unique_ptr<tts_generation_runner> runner_from_file(const char * fname, int n_threads,
+                                                   const generation_configuration & config, bool cpu_only) {
+    static const bool use_mmap{ !getenv("OLLAMA_NO_MMAP") };  // TODO(danielzgtg) temporary, will be --no-mmap later
+    unique_ptr<llama_mmap> in_mmap{};
+    if (use_mmap) {
+        llama_file in_map_file{ fname, "r" };
+        in_mmap = make_unique<llama_mmap>(&in_map_file);
+    }
+    ggml_context * weight_ctx{};
+    gguf_context * meta_ctx = gguf_init_from_file(fname, {
+                                                             .no_alloc{ use_mmap },
+                                                             .ctx{ &weight_ctx },
+                                                         });
+    if (!meta_ctx) {
+        GGML_ABORT("gguf_init_from_file failed for file %s\n", fname);
+    }
+    if (use_mmap) {
+        const int n{ gguf_get_n_tensors(&*meta_ctx) };
+        int       i{};
+        void *    in_buffer{ static_cast<char *>(in_mmap->addr()) + gguf_get_data_offset(meta_ctx) };
+        for (ggml_tensor & cur : ggml_tensor_iterator{ *weight_ctx }) {
+            GGML_ASSERT(i < n);
+            GGML_ASSERT(!strcmp(cur.name, gguf_get_tensor_name(&*meta_ctx, i)));
+            cur.data = static_cast</*const*/ char *>(in_buffer) + gguf_get_tensor_offset(&*meta_ctx, i);
+            ++i;
+        }
+    }
+    const int          arch_key = gguf_find_key(meta_ctx, "general.architecture");
+    if (arch_key < 0) {
+        GGML_ABORT("Missing general.architecture key in file %s\n", fname);
+    }
+    const char * const arch{ gguf_get_val_str(meta_ctx, arch_key) };
+    const auto         found = loader_registry().find(arch);
+    if (found == loader_registry().end()) {
+        GGML_ABORT("Unknown architecture %s\n", arch);
+    }
+    const auto &                      loader{ found->second.get() };
+    unique_ptr<tts_generation_runner> runner{ loader.from_file(meta_ctx, weight_ctx, n_threads, cpu_only, config) };
+    // TODO(mmwillet): change this weight assignment pattern to mirror llama.cpp
+    for (ggml_tensor & cur : ggml_tensor_iterator{ *weight_ctx }) {
+        if (!cur.data) {
+            continue;
+        }
+        if (!*cur.name) {
+            // handles the top level meta tensor
+            continue;
+        }
+        runner->assign_weight(cur.name, cur);
+    }
+    runner->prepare_post_load();
+    gguf_free(meta_ctx);
+    ggml_free(weight_ctx);
+    GGML_ASSERT(&runner->loader.get() == &loader);
+    runner->buf = move(in_mmap);
+    return runner;
+}
diff --git a/src/models/loaders.h b/src/models/loaders.h
new file mode 100644
index 0000000..7e48257
--- /dev/null
+++ b/src/models/loaders.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "../../include/common.h"
+
+struct gguf_context;
+
+struct tts_model_loader {  // abstract per-architecture factory that builds a runner from an opened GGUF file
+    /// Installs a model loader for the specified model architecture name (the GGUF general.architecture value it is looked up by)
+    explicit tts_model_loader(const char * arch);
+    const char * const                        arch;  // architecture name given to the constructor
+    virtual unique_ptr<tts_generation_runner> from_file(
+        gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, bool cpu_only,
+        /* TODO move to generate() */ const generation_configuration & config) const = 0;
+  protected:
+    ~tts_model_loader() = default;  // protected and non-virtual: a loader cannot be deleted through this base type
+};
+
+unique_ptr<tts_generation_runner> runner_from_file(const char * fname, int n_threads,
+                                                   const generation_configuration & config, bool cpu_only = true);
diff --git a/src/models/orpheus/CMakeLists.txt b/src/models/orpheus/CMakeLists.txt
new file mode 100644
index 0000000..2e86004
--- /dev/null
+++ b/src/models/orpheus/CMakeLists.txt
@@ -0,0 +1,5 @@
+target_sources(tts PRIVATE
+        loader.cpp
+        model.cpp
+        model.h
+)
diff --git a/src/models/orpheus/loader.cpp b/src/models/orpheus/loader.cpp
new file mode 100644
index 0000000..c6110a0
--- /dev/null
+++ b/src/models/orpheus/loader.cpp
@@ -0,0 +1,24 @@
+#include "../loaders.h"
+#include "model.h"
+
+void orpheus_register() {}  // intentionally empty; called from the static initializer in loaders.cpp
+
+orpheus_model_loader::orpheus_model_loader() : tts_model_loader{ "orpheus" } {}  // architecture name is "orpheus"
+
+unique_ptr<tts_generation_runner> orpheus_model_loader::from_file(gguf_context * meta_ctx, ggml_context * weight_ctx,
+                                                                  int n_threads, bool cpu_only,
+                                                                  const generation_configuration & config) const {  // config is not read by this loader
+    orpheus_model * model       = new orpheus_model;
+    snac_model *    audio_model = new snac_model;
+    bpe_tokenizer * bt          = bpe_tokenizer_from_gguf(meta_ctx);
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    sampler *          samp          = new sampler;
+    snac_context *     sctx          = build_new_snac_context(audio_model, n_threads, cpu_only);
+    snac_runner *      audio_decoder = new snac_runner(audio_model, sctx);
+    orpheus_context *  octx          = build_new_orpheus_context(model, n_threads, cpu_only);
+    orpheus_kv_cache * cache         = new orpheus_kv_cache;
+    return make_unique<orpheus_runner>(model, audio_decoder, octx, bt, samp, cache);  // the runner receives the raw pointers created above
+}
+
+const orpheus_model_loader orpheus_loader{};  // global instance; its constructor performs the registration
diff --git a/src/orpheus_model.cpp b/src/models/orpheus/model.cpp
similarity index 93%
rename from src/orpheus_model.cpp
rename to src/models/orpheus/model.cpp
index 4866af2..cda5793 100644
--- a/src/orpheus_model.cpp
+++ b/src/models/orpheus/model.cpp
@@ -1,4 +1,4 @@
-#include "orpheus_model.h"
+#include "model.h"
 
 #include <array>
 
@@ -386,7 +386,7 @@ std::vector<std::vector<uint32_t>> orpheus_runner::prepare_output_tokens() {
     return output_tokens;
 }
 
-void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) {
+void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, tts_response & output) {
     while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) {
         decode(batch);
         generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens);
@@ -401,10 +401,21 @@ void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_resp
         fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n");
     }
     std::vector<std::vector<uint32_t>> processed_output_tokens = prepare_output_tokens();
-    srunner->run(processed_output_tokens, output);
+    srunner->run(processed_output_tokens, &output);
 }
 
-int orpheus_runner::generate(std::string sentence, struct tts_response * response) {
+void orpheus_runner::generate(const char * sentence, tts_response & response, const generation_configuration & config) {
+    generation_sampler->temperature        = config.temperature;
+    generation_sampler->repetition_penalty = config.repetition_penalty;
+    generation_sampler->do_sample          = config.sample;
+    generation_sampler->top_k              = config.top_k;
+    generation_sampler->top_p              = config.top_p;
+    if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config.voice) == orpheus_voices.end() &&
+        !config.voice.empty()) {
+        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config.voice.c_str());
+    }
+    octx->voice = config.voice;
+
     orpheus_ubatch batch = batch_from_sentence(sentence);
     // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
     // surpass the default size.
@@ -417,19 +428,6 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
         orpheus_kv_cache_init();
     }
     generate_from_batch(batch, response);
-    return 0;
-}
-
-void orpheus_runner::configure_generation(generation_configuration * config) {
-    generation_sampler->temperature = config->temperature;
-    generation_sampler->repetition_penalty = config->repetition_penalty;
-    generation_sampler->do_sample = config->sample;
-    generation_sampler->top_k = config->top_k;
-    generation_sampler->top_p = config->top_p;
-    if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
-        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
-    }
-    octx->voice = config->voice;
 }
 
 orpheus_ubatch orpheus_runner::build_worst_case_batch() {
@@ -438,22 +436,13 @@ orpheus_ubatch orpheus_runner::build_worst_case_batch() {
     return batch;
 }
 
-void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) {
-    if (tensor->data == NULL) {
-        return;
-    }
-
-    if (name.size() == 0) {
-        // handles the top level meta tensor
-        return;
-    }
-
-    if (name.size() > 5 && name.substr(0, 5) == "snac.") {
-        srunner->model->assign_weight(name.substr(5), tensor);
-    } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") {
-        model->assign_weight(name.substr(8), tensor);
+void orpheus_runner::assign_weight(const char * name, ggml_tensor & tensor) {
+    if (const string_view name_sv{ name }; name_sv.starts_with("snac.")) {
+        srunner->model->assign_weight(string{ name_sv.substr(sizeof("snac.") - 1) }, &tensor);
+    } else if (name_sv.starts_with("orpheus.")) {
+        model->assign_weight(string{ name_sv.substr(sizeof("orpheus.") - 1) }, &tensor);
     } else {
-        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str());
+        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name);
     }
 }
 
@@ -465,11 +454,6 @@ void orpheus_runner::prepare_post_load() {
     octx->prep_schedule(gf);
 }
 
-std::vector<std::string> list_voices() {
-	std::vector<std::string> voices;
-	voices.reserve(orpheus_voices.size());
-	for (auto voice : orpheus_voices) {
-		voices.push_back(voice);
-	}
-	return voices;
+std::vector<std::string_view> orpheus_runner::list_voices() {
+    return vector<string_view>(cbegin(orpheus_voices), cend(orpheus_voices));
 }
diff --git a/src/orpheus_model.h b/src/models/orpheus/model.h
similarity index 81%
rename from src/orpheus_model.h
rename to src/models/orpheus/model.h
index 9f02d76..0e2bb94 100644
--- a/src/orpheus_model.h
+++ b/src/models/orpheus/model.h
@@ -1,8 +1,17 @@
 #pragma once
 
-#include "sampler.h"
-#include "tokenizer.h"
-#include "snac_model.h"
+#include "../../decoder/snac_model.h"
+#include "../../sampler.h"
+#include "../../tokenizer.h"
+#include "models/loaders.h"
+
+extern const struct orpheus_model_loader final : tts_model_loader {
+    explicit orpheus_model_loader();
+
+    unique_ptr<tts_generation_runner> from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads,
+                                                bool cpu_only, const generation_configuration & config) const override;
+} orpheus_loader;
+
 
 // Orpheus uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads.
 
@@ -102,14 +111,14 @@ struct orpheus_ubatch {
     std::vector<uint32_t> tokens;    // [n_tokens]
 };
 
-struct orpheus_runner : tts_runner {
+struct orpheus_runner : tts_generation_runner {
     orpheus_runner(
             orpheus_model * model, 
             snac_runner * audio_decoder, 
             orpheus_context * octx, 
             bpe_tokenizer * bt, 
             sampler * samp, 
-            orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
+            orpheus_kv_cache * cache): tts_generation_runner{orpheus_loader}, model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) {
         tts_runner::sampling_rate = 24000.0f;
         generation_sampler->n_output_heads = 1;
         generation_sampler->vocab_size = model->vocab_size;
@@ -126,20 +135,19 @@ struct orpheus_runner : tts_runner {
         tts_runner::init_build(&octx->buf_compute_meta);
     }
 
-    std::vector<std::string> list_voices();
+    std::vector<std::string_view> list_voices() override;
     struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch);
     void orpheus_kv_cache_init();
     void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat);
-    void configure_generation(generation_configuration * config);
-    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_weight(const char * name, ggml_tensor & tensor) override;
     std::vector<std::vector<uint32_t>> prepare_output_tokens();
     orpheus_ubatch build_worst_case_batch();
     orpheus_ubatch batch_from_sentence(std::string sentence);
     void set_inputs(orpheus_ubatch & batch);
     void decode(orpheus_ubatch & batch);
-    void prepare_post_load();
-    int generate(std::string sentence, struct tts_response * response);
-    void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output);
+    void prepare_post_load() override;
+    void generate(const char * sentence, tts_response & response, const generation_configuration & config) override;
+    void generate_from_batch(orpheus_ubatch & batch, tts_response & output);
 };
 
 static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight);
diff --git a/src/models/parler/CMakeLists.txt b/src/models/parler/CMakeLists.txt
new file mode 100644
index 0000000..0a540f3
--- /dev/null
+++ b/src/models/parler/CMakeLists.txt
@@ -0,0 +1,6 @@
+target_sources(tts PRIVATE
+        loader.cpp
+        model.cpp
+        model.h
+)
+add_subdirectory(t5)
diff --git a/src/models/parler/loader.cpp b/src/models/parler/loader.cpp
new file mode 100644
index 0000000..881bafc
--- /dev/null
+++ b/src/models/parler/loader.cpp
@@ -0,0 +1,26 @@
+#include "../loaders.h"
+#include "model.h"
+
+void parler_register() {}  // intentionally empty; called from the static initializer in loaders.cpp
+
+parler_model_loader::parler_model_loader() : tts_model_loader{ "parler-tts" } {}  // architecture name is "parler-tts"
+
+unique_ptr<tts_generation_runner> parler_model_loader::from_file(gguf_context * meta_ctx, ggml_context * weight_ctx,
+                                                                 int n_threads, bool cpu_only,
+                                                                 const generation_configuration & config) const {
+    parler_tts_model *  model       = new parler_tts_model;
+    dac_model *         audio_model = new dac_model;
+    unigram_tokenizer * ut          = unigram_tokenizer_from_gguf(meta_ctx);
+    ut->initialize_tokenizer();
+    model->use_cross_attn = config.use_cross_attn;  // set before setup_from_file(); presumably read during setup — TODO confirm
+    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
+    sampler *         samp          = new sampler;
+    dac_context *     dctx          = build_new_dac_context(audio_model, n_threads, cpu_only);
+    dac_runner *      audio_decoder = new dac_runner(audio_model, dctx);
+    parler_context *  pctx          = build_new_parler_context(model, n_threads, cpu_only);
+    parler_kv_cache * cache         = new parler_kv_cache;
+    return make_unique<parler_tts_runner>(model, audio_decoder, pctx, ut, samp, cache);  // the runner receives the raw pointers created above
+}
+
+const parler_model_loader parler_loader{};  // global instance; its constructor performs the registration
diff --git a/src/parler_model.cpp b/src/models/parler/model.cpp
similarity index 94%
rename from src/parler_model.cpp
rename to src/models/parler/model.cpp
index 7f4fec1..4d731d2 100644
--- a/src/parler_model.cpp
+++ b/src/models/parler/model.cpp
@@ -1,4 +1,4 @@
-#include "parler_model.h"
+#include "model.h"
 
 // For loading parler model from gguf file.
 static const std::map<std::string, parler_tensor> PARLER_TENSOR_GGUF_LOOKUP = {
@@ -336,10 +336,9 @@ struct parler_context * build_new_parler_context(struct parler_tts_model * model
     return pctx;
 }
 
-static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id) {
+static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx) {
     const int64_t n_layer = (int64_t) model->layers.size();
-    cache->seq_id = seq_id;
-    
+
     ggml_backend_buffer_type_t buft = nullptr;
     // this will only really support cpu or metal for the time being;
     if (pctx->backend != nullptr) {
@@ -498,32 +497,26 @@ static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts
     return batch;
 }
 
-void parler_tts_runner::assign_weight(std::string name, ggml_tensor * tensor) {
-    std::string::size_type pos = name.find(".", 0);
-    std::string top_level(name.substr(0, pos));
-    std::string value(name.substr(pos + 1));
-    if (tensor->data == NULL) {
-        return;
-    }
-    if (top_level == "audio_encoder") {
-        dac_runner->model->assign_weight(value, tensor);
-    } else if (top_level == "decoder") {
-        model->assign_weight(value, tensor);
+void parler_tts_runner::assign_weight(const char * name, ggml_tensor & tensor) {
+    if (const string_view name_sv{ name }; name_sv.starts_with("audio_encoder.")) {
+        dac_runner->model->assign_weight(string{ name_sv.substr(sizeof("audio_encoder.") - 1) }, &tensor);
+    } else if (name_sv.starts_with("decoder.")) {
+        model->assign_weight(string{ name_sv.substr(sizeof("decoder.") - 1) }, &tensor);
     } else {
-        return;
+        fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name);
     }
 }
 
-void parler_tts_runner::update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only) {
+void parler_tts_runner::update_conditional_prompt(const char * file_path, const char * prompt) {
+    const int      n_threads{ pctx->n_threads };
+    constexpr bool cpu_only{true}; // TODO
     t5_runner * text_encoder = text_encoder_from_file(file_path, n_threads, tokenizer, cpu_only);
     tts_response* response;
     text_encoder->generate(prompt, response);
     model->prep_cross_key_values(n_threads, response);
     delete text_encoder;
-    return;
 }
 
-
 struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch) {
     init_build();
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
@@ -620,15 +613,6 @@ struct ggml_cgraph * parler_tts_runner::build_parler_graph(parler_ubatch & batch
     return gf;
 }
 
-void parler_tts_runner::configure_generation(generation_configuration * config) {
-    sampler->temperature = config->temperature;
-    sampler->repetition_penalty = config->repetition_penalty;
-    sampler->do_sample = config->sample;
-    sampler->top_k = config->top_k;
-    sampler->top_p = config->top_p;
-    model->use_cross_attn = config->use_cross_attn;
-}
-
 void parler_tts_runner::set_inputs(parler_ubatch & batch) {
     if (batch.audio_generation) {
         ggml_backend_tensor_set(pctx->audio_inp_tokens, batch.audio_tokens, 0, batch.n_audio_tokens*ggml_element_size(pctx->audio_inp_tokens));
@@ -718,17 +702,16 @@ parler_ubatch parler_tts_runner::build_worst_case_batch()  {
 }
 
 void parler_tts_runner::prepare_post_load() {
+    if (model->use_cross_attn) {
+        model->prep_cross_key_values(pctx->n_threads);
+    }
     dac_runner->prepare_post_load();
-    parler_kv_cache_init(kv_self, model, pctx, std::mt19937(std::random_device{}())());
+    parler_kv_cache_init(kv_self, model, pctx);
     auto batch = build_worst_case_batch();
     auto gf = build_parler_graph(batch);
     pctx->prep_schedule(gf);
 }
 
-bool parler_tts_runner::adjust_for_sequence_continuation(struct parler_ubatch & batch) {
-    return false; // not implemneted
-}
-
 bool parler_tts_runner::check_stopping() {
     int32_t token_position = (int32_t) pctx->output_tokens.size() - (int32_t) model->n_output_heads;
     if (token_position < 0) {
@@ -776,7 +759,7 @@ void parler_tts_runner::adjust_output_tokens(std::vector<uint32_t> & output_toke
     }
 }
 
-int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_response * output) {
+int parler_tts_runner::generate_from_batch(parler_ubatch & batch, tts_response & output) {
     std::vector<uint32_t> next_decoder_token_ids;
     next_decoder_token_ids.reserve(model->n_output_heads);
 
@@ -804,7 +787,7 @@ int parler_tts_runner::generate_from_batch(parler_ubatch & batch, struct tts_res
 
     std::vector<uint32_t> filtered_output_tokens;
     adjust_output_tokens(pctx->output_tokens, filtered_output_tokens);
-    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, output);
+    dac_runner->run(filtered_output_tokens.data(), (int32_t) filtered_output_tokens.size() / model->n_output_heads, &output);
     return 0;
 }
 
@@ -815,7 +798,7 @@ int parler_tts_runner::generate_audio_tokens(std::string sentence) {
     int32_t seq_id = std::mt19937(std::random_device{}())();
     if (!kv_self) {
         kv_self = new parler_kv_cache;
-        if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) {
+        if (!parler_kv_cache_init(kv_self, model, pctx)) {
             return 1;
         }
     }
@@ -852,23 +835,24 @@ void parler_tts_runner::just_audio_token_decode(uint32_t * tokens, int32_t sq_le
     dac_runner->run(tokens, sq_len, outputs);
 }
 
-int parler_tts_runner::generate(std::string sentence, struct tts_response * output, int32_t seq_id) {
+void parler_tts_runner::generate(const char * sentence, tts_response & output,
+                                 const generation_configuration & config) {
+    sampler->temperature        = config.temperature;
+    sampler->repetition_penalty = config.repetition_penalty;
+    sampler->do_sample          = config.sample;
+    sampler->top_k              = config.top_k;
+    sampler->top_p              = config.top_p;
+    model->use_cross_attn       = config.use_cross_attn;
+
     parler_ubatch batch = batch_from_sentence(sentence, model, tokenizer);
     pctx->reset(model->n_output_heads);
     sampler->reset();
-    if (pctx->seq_id != seq_id || seq_id == -1) {
-        seq_id = std::mt19937(std::random_device{}())();
-        pctx->current_position = 0;
-        if (!kv_self) {
-            kv_self = new parler_kv_cache;
-            if (!parler_kv_cache_init(kv_self, model, pctx, seq_id)) {
-                return 1;
-            }
-        }
-    } else {
-        if (!adjust_for_sequence_continuation(batch)) {
-            return 2;
+    pctx->current_position = 0;
+    if (!kv_self) {
+        kv_self = new parler_kv_cache;
+        if (!parler_kv_cache_init(kv_self, model, pctx)) {
+            return;
         }
     }
-    return generate_from_batch(batch, output);
+    generate_from_batch(batch, output);
 }
diff --git a/src/parler_model.h b/src/models/parler/model.h
similarity index 88%
rename from src/parler_model.h
rename to src/models/parler/model.h
index 463910f..529db71 100644
--- a/src/parler_model.h
+++ b/src/models/parler/model.h
@@ -1,9 +1,16 @@
-#ifndef parler_model_h
-#define parler_model_h
+#pragma once
 
-#include "dac_model.h"
-#include "t5_encoder_model.h"
-#include "sampler.h"
+#include "../../decoder/dac_model.h"
+#include "../../sampler.h"
+#include "models/loaders.h"
+#include "t5/model.h"
+
+extern const struct parler_model_loader final : tts_model_loader {
+    explicit parler_model_loader();
+
+    unique_ptr<tts_generation_runner> from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads,
+                                                bool cpu_only, const generation_configuration & config) const override;
+} parler_loader;
 
 enum parler_tensor {
     PARLER_EMBD,
@@ -112,8 +119,7 @@ struct parler_context : runner_context {
     int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
     uint32_t current_position = 0; // current position in the active sequence
     uint32_t prompt_end_position = 0; // the position of the text prompt termination (used for adjusting the cache when incrementally generating)
-    int32_t seq_id; // a unique identifier associated with the active sequence.
-    
+
     std::vector<uint32_t> output_tokens;
     
     struct ggml_tensor * inp_tokens;
@@ -129,8 +135,6 @@ struct parler_context : runner_context {
 };
 
 struct parler_kv_cache {
-    int32_t seq_id;
-    
     ggml_type type_k = GGML_TYPE_F32;
     ggml_type type_v = GGML_TYPE_F32;
 
@@ -168,7 +172,7 @@ struct parler_ubatch {
 };
 
 struct parler_context * build_new_parler_context(struct parler_tts_model * model, int n_threads, bool use_cpu = true);
-static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx, int32_t seq_id);
+static bool parler_kv_cache_init(struct parler_kv_cache * cache, parler_tts_model * model, parler_context * pctx);
 
 struct ggml_tensor * parler_build_inp_embd(struct ggml_context * ctx, struct parler_context * pctx, parler_tts_model * model, const parler_ubatch & batch);
 struct ggml_tensor * parler_build_layer_norm(struct ggml_context * ctx, struct ggml_tensor * inputs, struct ggml_tensor * weight, struct ggml_tensor * bias);
@@ -180,8 +184,8 @@ static struct parler_ubatch batch_from_sentence(std::string sentence, parler_tts
 
 // This struct is intended to support end-to-end TTS generation. As such, it manages the parler tts model compilation, compute and generation process,
 // the tokenization and sampling process, and uses the dac_runner struct to encode audio outputs.
-struct parler_tts_runner : tts_runner {
-    parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {};
+struct parler_tts_runner : tts_generation_runner {
+    parler_tts_runner(parler_tts_model * model, dac_runner * audio_decoder, parler_context * pctx, unigram_tokenizer * ut, sampler * samp, parler_kv_cache * cache): tts_generation_runner{parler_loader}, model(model), dac_runner(audio_decoder), pctx(pctx), tokenizer(ut), sampler(samp), kv_self(cache) {};
     ~parler_tts_runner() {
         if (ctx) {
             ggml_free(ctx);
@@ -204,22 +208,18 @@ struct parler_tts_runner : tts_runner {
         tts_runner::init_build(&pctx->buf_compute_meta);
     }
 
-    void configure_generation(generation_configuration * config);
-    void assign_weight(std::string name, ggml_tensor * tensor);
+    void assign_weight(const char * name, ggml_tensor & tensor) override;
     parler_ubatch build_worst_case_batch();
     struct ggml_cgraph * build_parler_graph(parler_ubatch & batch);
     void set_inputs(parler_ubatch & batch);
     int decode(parler_ubatch & batch);
-    void prepare_post_load();
-    bool adjust_for_sequence_continuation(struct parler_ubatch & batch);
-    int generate(std::string sentence, struct tts_response * response, int32_t seq_id = -1);
+    void prepare_post_load() override;
+    void generate(const char * sentence, tts_response & output, const generation_configuration & config) override;
     bool check_stopping();
     void adjust_output_tokens(std::vector<uint32_t> & output_tokens, std::vector<uint32_t> & filtered);
-    int generate_from_batch(parler_ubatch & batch, struct tts_response * output);
+    int generate_from_batch(parler_ubatch & batch, tts_response & output);
     void parler_graph_compute(ggml_cgraph * gf);
     void just_audio_token_decode(uint32_t * tokens, int32_t sq_len, struct tts_response * output);
     int generate_audio_tokens(std::string sentence);
-    void update_conditional_prompt(const std::string file_path, const std::string prompt, int n_threads, bool cpu_only = true);
+    void update_conditional_prompt(const char * file_path, const char * prompt) override;
 };
-
-#endif
diff --git a/src/models/parler/t5/CMakeLists.txt b/src/models/parler/t5/CMakeLists.txt
new file mode 100644
index 0000000..7ba20a4
--- /dev/null
+++ b/src/models/parler/t5/CMakeLists.txt
@@ -0,0 +1,4 @@
+target_sources(tts PRIVATE
+        model.cpp
+        model.h
+)
diff --git a/src/t5_encoder_model.cpp b/src/models/parler/t5/model.cpp
similarity index 99%
rename from src/t5_encoder_model.cpp
rename to src/models/parler/t5/model.cpp
index 2dbc761..3751278 100644
--- a/src/t5_encoder_model.cpp
+++ b/src/models/parler/t5/model.cpp
@@ -1,4 +1,4 @@
-#include "t5_encoder_model.h"
+#include "model.h"
 
 static const std::map<std::string, t5_tensor> T5_TENSOR_GGUF_LOOKUP = {
     {"t5encoder.token_embd", T5_EMBD},
@@ -139,7 +139,7 @@ void t5_encoder::prep_constants(gguf_context * meta) {
     int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id");
     if (bos_token_id_key != -1) {
         bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
-    }    
+    }
 
     int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
     if (eos_token_id_key != -1) {
@@ -219,7 +219,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
-    
+
     //t5ctx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
     //ggml_set_input(t5ctx->positions);
 
@@ -233,7 +233,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
 
     struct ggml_tensor * KQ_mask_dec = build_t5_attn_mask(ctx, t5ctx, batch);
     struct ggml_tensor * pos_bias = build_t5_pos_bias(ctx, t5ctx->inp_pos_bucket, model->relative_attn_bias);
-    
+
     for (int l = 0; l < model->n_layers; l++) {
         struct ggml_tensor * residual = inpL;
 
@@ -293,7 +293,7 @@ struct ggml_cgraph * t5_runner::build_t5_graph(t5_ubatch & batch) {
     ggml_build_forward_expand(gf, cur);
 
     free_build();
-    
+
     return gf;
 }
 
@@ -312,7 +312,7 @@ void t5_runner::set_inputs(t5_ubatch & batch) {
         for (int ii = 0; ii < batch.n_tokens; ii++) {
         	int ab_rpos = abs(i - ii);
         	int rpos = i - ii;
-            attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f; 
+            attn_mask[i*batch.n_tokens + ii] = 0.0f; //ii > i ? -INFINITY : 0.0f;
             pos_bucket[i*batch.n_tokens + ii] = (uint32_t) (rpos > 0 ? n_buckets : 0) + (ab_rpos < max_exact ? ab_rpos : std::min((n_buckets - 1), (max_exact + (int)((log((ab_rpos / max_exact)) / logarithmic_denominator) * max_exact))));
         }
     }
@@ -324,10 +324,10 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
     batch.input_tokens = input_tokens;
     batch.n_tokens = sequence_length;
     ggml_backend_sched_reset(t5ctx->sched);
-    
+
     const size_t prev_size = t5ctx->buf_output ? ggml_backend_buffer_get_size(t5ctx->buf_output) : 0;
     const size_t new_size = model->max_context_length * model->output_size * sizeof(float);
-    
+
     if (!t5ctx->buf_output || prev_size < new_size) {
         if (t5ctx->buf_output) {
             ggml_backend_buffer_free(t5ctx->buf_output);
@@ -337,7 +337,7 @@ void t5_runner::run(uint32_t * input_tokens, uint32_t sequence_length, struct tt
 
         t5ctx->buf_output = ggml_backend_buft_alloc_buffer(t5ctx->backend_cpu_buffer, new_size);
     }
-    
+
     outputs->data = (float *) ggml_backend_buffer_get_base(t5ctx->buf_output);
     ggml_backend_buffer_clear(t5ctx->buf_output, 0);
     struct ggml_cgraph * gf = NULL;
diff --git a/src/t5_encoder_model.h b/src/models/parler/t5/model.h
similarity index 97%
rename from src/t5_encoder_model.h
rename to src/models/parler/t5/model.h
index 9a80187..423ccce 100644
--- a/src/t5_encoder_model.h
+++ b/src/models/parler/t5/model.h
@@ -1,9 +1,7 @@
-#ifndef t5_encoder_model_h
-#define t5_encoder_model_h
-
-#include "tts_model.h"
-#include "tokenizer.h"
+#pragma once
 
+#include "../../../tokenizer.h"
+#include "../../../tts_model.h"
 
 enum t5_tensor {
     T5_EMBD,
@@ -126,5 +124,3 @@ struct t5_runner : tts_runner {
 };
 
 struct t5_runner * text_encoder_from_file(std::string file_path, int n_threads, unigram_tokenizer * tokenizer, bool cpu_only = true);
-
-#endif
diff --git a/src/tts.cpp b/src/tts.cpp
deleted file mode 100644
index f5faf28..0000000
--- a/src/tts.cpp
+++ /dev/null
@@ -1,445 +0,0 @@
-#include "tts.h"
-#include <mutex>
-
-// A list of all of the top level GGUF names under kokoro.duration_predictor that have quantization compatible tensors.
-static constexpr std::array<const char *, 5> DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS = {
-    "duration_proj",
-    "encode",
-    "shared_lstm",
-    "duration_lstm",
-    "layers"
-};
-
-struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    orpheus_model * model = new orpheus_model;
-    snac_model * audio_model = new snac_model;
-    bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx);
-    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    sampler * samp = new sampler;
-    snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only);
-    snac_runner * audio_decoder = new snac_runner(audio_model, sctx);
-    orpheus_context * octx = build_new_orpheus_context(model, n_threads, cpu_only);
-    orpheus_kv_cache * cache = new orpheus_kv_cache;
-    orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache);
-
-    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
-        runner->assign_weight(cur->name, cur);
-    }
-
-    runner->prepare_post_load();
-
-    gguf_free(meta_ctx);
-    ggml_free(weight_ctx);
-    runner->arch = arch;
-
-    return (tts_runner*)runner;
-}
-
-struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    parler_tts_model * model = new parler_tts_model;
-    dac_model * audio_model = new dac_model;
-    unigram_tokenizer * ut = unigram_tokenizer_from_gguf(meta_ctx);
-    ut->initialize_tokenizer();
-    model->use_cross_attn = config->use_cross_attn;
-    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct sampler * samp = new sampler;
-    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
-    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
-    struct parler_context * pctx = build_new_parler_context(model, n_threads, cpu_only);
-    struct parler_kv_cache * cache = new parler_kv_cache;
-    struct parler_tts_runner * runner = new parler_tts_runner(model, audio_decoder, pctx, ut, samp, cache);
-
-    // TODO: change this weight assignment pattern to mirror llama.cpp
-    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
-        runner->assign_weight(cur->name, cur);
-    }
-
-    if (config->use_cross_attn) {
-        runner->model->prep_cross_key_values(n_threads);
-    }
-
-    runner->prepare_post_load();
-
-    gguf_free(meta_ctx);
-    ggml_free(weight_ctx);
-    runner->arch = arch;
-
-    return (tts_runner*)runner;
-}
-
-struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    kokoro_model * model = new kokoro_model;
-    single_pass_tokenizer * spt = single_pass_tokenizer_from_gguf(meta_ctx, "tokenizer.ggml.tokens");
-    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct kokoro_duration_context * kdctx = build_new_duration_kokoro_context(model, n_threads, cpu_only);
-    struct kokoro_duration_runner * duration_runner = new kokoro_duration_runner(model, kdctx, spt);
-    struct kokoro_context * kctx = build_new_kokoro_context(model, n_threads, cpu_only);
-    // if an espeak voice id wasn't specifically set infer it from the kokoro voice, if it was override it, otherwise fallback to American English.
-    std::string espeak_voice_id = config->espeak_voice_id;
-    if (espeak_voice_id.empty()) {
-        espeak_voice_id = !config->voice.empty() && KOKORO_LANG_TO_ESPEAK_ID.find(config->voice.at(0)) != KOKORO_LANG_TO_ESPEAK_ID.end() ? KOKORO_LANG_TO_ESPEAK_ID[config->voice.at(0)] : "gmw/en-US";
-    }
-    struct phonemizer * phmzr = phonemizer_from_gguf(meta_ctx, espeak_voice_id);
-    struct kokoro_runner * runner = new kokoro_runner(model, kctx, spt, duration_runner, phmzr);
-
-    // TODO: change this weight assignment pattern to mirror llama.cpp
-    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
-        runner->assign_weight(cur->name, cur);
-    }
-
-    runner->prepare_post_load();
-
-    gguf_free(meta_ctx);
-    ggml_free(weight_ctx);
-    runner->arch = arch;
-
-    return (tts_runner*)runner;
-}
-
-struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) {
-    dia_model * model = new dia_model;
-    dac_model * audio_model = new dac_model;
-    model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only);
-    struct sampler * samp = new sampler;
-    struct dac_context * dctx = build_new_dac_context(audio_model, n_threads, cpu_only);
-    struct dac_runner * audio_decoder = new dac_runner(audio_model, dctx);
-    struct dia_context * diactx = build_new_dia_context(model, n_threads, cpu_only);
-    struct dia_kv_cache * cache = new dia_kv_cache;
-    struct dia_runner * runner = new dia_runner(model, audio_decoder, diactx, samp, cache);
-
-    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
-        runner->assign_weight(cur->name, cur);
-    }
-
-    runner->prepare_post_load();
-
-    gguf_free(meta_ctx);
-    ggml_free(weight_ctx);
-    runner->arch = arch;
-
-    return (tts_runner*)runner;
-}
-
-// currently only metal and cpu devices are supported, so cpu_only only describes whether or not to try to load and run on metal.
-struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only) {
-    ggml_context * weight_ctx = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc   =*/ false,
-        /*.ctx        =*/ &weight_ctx,
-    };
-    gguf_context * meta_ctx = gguf_init_from_file(fname.c_str(), params);
-    if (!meta_ctx) {
-        TTS_ABORT("%s failed for file %s\n", __func__, fname.c_str());
-    }
-    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
-    if (arch_key == -1) {
-        TTS_ABORT("%s failed for file %s. No architecture is set.\n", __func__, fname.c_str());
-    }
-    std::string arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
-    if (SUPPORTED_ARCHITECTURES.find(arch) == SUPPORTED_ARCHITECTURES.end()) {
-        TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
-    }
-    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
-    switch(arch_type) {
-        case PARLER_TTS_ARCH:
-            return parler_tts_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
-        case KOKORO_ARCH:
-            return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
-        case DIA_ARCH:
-            return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
-        case ORPHEUS_ARCH:
-            return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only);
-        default:
-            TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str());
-    }
-}
-
-int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
-    switch(runner->arch) {
-        case PARLER_TTS_ARCH:
-            ((parler_tts_runner*)runner)->configure_generation(config);
-            return ((parler_tts_runner*)runner)->generate(sentence, response);
-        case KOKORO_ARCH:
-            return ((kokoro_runner*)runner)->generate(sentence, response, config->voice, config->espeak_voice_id);
-        case DIA_ARCH:
-            ((dia_runner*)runner)->configure_generation(config);
-            return ((dia_runner*)runner)->generate(sentence, response);
-        case ORPHEUS_ARCH:
-            ((orpheus_runner*)runner)->configure_generation(config);
-            return ((orpheus_runner*)runner)->generate(sentence, response);
-        default:
-            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, runner->arch);
-    }
-}
-
-std::vector<std::string> list_voices(tts_runner * runner) {
-    switch(runner->arch) {
-        case KOKORO_ARCH:
-            return ((kokoro_runner*)runner)->list_voices();
-        default:
-            TTS_ABORT("%s failed. The architecture '%d' does not support #list_voices supported.", __func__, runner->arch);
-    }
-}
-
-void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string prompt, bool cpu_only) {
-    int n_threads = ((parler_tts_runner*)runner)->pctx->n_threads;
-    ((parler_tts_runner*)runner)->update_conditional_prompt(file_path, prompt, n_threads, cpu_only);
-}
-
-bool kokoro_is_f16_compatible(std::string name) {
-    return name.find("voice_tensors") == std::string::npos && 
-           name.find("bias") == std::string::npos &&
-           name.find("gamma") == std::string::npos &&
-           name.find("beta") == std::string::npos &&
-           name.find("alpha") == std::string::npos &&
-           !has_suffix(name, "embd") &&
-           !has_suffix(name, "norm");
-}
-
-bool kokoro_is_quantizable(std::string name, struct quantization_params * params) {
-    if (kokoro_is_f16_compatible(name)) {
-        if (has_prefix(name, "kokoro.albert") || has_prefix(name, "kokoro.text_encoder.lstm")) {
-            return true;
-        } else if (has_prefix(name, "kokoro.duration_predictor.")) {
-            std::vector<std::string> parts = split(name, ".");
-            for (std::string part : DURATION_PREDICTOR_QUANTIZATION_COMPATIBLE_PARTS) {
-                if (part == parts[2]) {
-                    return true;
-                }
-            }
-        }
-    }
-    return false;
-}
-
-bool dia_is_quantizable(std::string name, struct quantization_params * params) {
-    // The DAC audio encoder / decoder is not compatible with quantization and normalization tensors should not be quantized.
-    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm");
-    if (!params->quantize_output_heads) {
-        quantizable = quantizable && !has_prefix(name, "dia.decoder.heads");
-    }
-    return quantizable;
-}
-
-bool parler_is_quanitizable(std::string name, struct quantization_params * params) {
-    // the DAC audio encoder / decoder is not compatible with quantization, normalization weight shouldn't be quantized, and the text encoding shouldn't be normalized.
-    bool quantizable = !has_prefix(name, "audio_encoder") && !has_suffix(name, "norm.weight") && !has_suffix(name, "text_encoding") && !has_suffix(name, "positional_embed") && !has_suffix(name, "norm.bias");
-    if (!params->quantize_output_heads) {
-        quantizable = quantizable && !has_suffix(name, "weight.head");
-    }
-    if (!params->quantize_text_embeddings) {
-        quantizable = quantizable && !has_suffix(name, "embed_prompts");
-    }
-    if (!params->quantize_cross_attn_kv) {
-        quantizable = quantizable && !has_suffix(name, "encoder_attn.k_proj.weight") && !has_suffix(name, "encoder_attn.v_proj.weight");   
-    }
-    return quantizable;
-}
-
-bool is_quantizable(tts_arch arch, std::string name, struct quantization_params * params) {
-    switch(arch) {
-        case PARLER_TTS_ARCH:
-            return parler_is_quanitizable(name, params);
-        case DIA_ARCH:
-            return dia_is_quantizable(name, params);
-        case KOKORO_ARCH:
-            return kokoro_is_quantizable(name, params);
-        default:
-            TTS_ABORT("%s failed. The architecture '%d' is not supported.", __func__, arch);
-    }
-}
-
-size_t quantize_tensor(void * new_data, struct ggml_tensor * tensor, const float * imatrix, enum ggml_type qtype, uint32_t n_threads) {
-    // much of this is form copied from llama.cpp
-    int chunk_size_multiplier = 1;
-    if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8 || qtype == GGML_TYPE_Q4_0_8_8) {
-        if ((qtype == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) qtype = GGML_TYPE_Q4_0;
-        else if (tensor->ne[1] % 4 != 0) qtype = GGML_TYPE_Q4_0;
-        if (qtype == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-        else if (qtype == GGML_TYPE_Q4_0_4_4 || qtype == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-    }
-    size_t out_size = 0;
-    const int32_t d3_step = tensor->ne[0] * tensor->ne[1];
-    const int32_t n_per_row = tensor->ne[0];
-    const int32_t nrows = tensor->ne[1];
-    static const int32_t min_chunk_size = 32 * 512;
-    const int32_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
-    uint32_t thread_count = std::max(1, std::min((int)n_threads, (int)(d3_step + chunk_size - 1) / chunk_size));
-    std::mutex mutex;
-
-    for (int32_t d3_index = 0; d3_index < tensor->ne[2]; d3_index++) {
-        const float * f32_data_d3 = ((float *) tensor->data) + d3_index * d3_step;
-        void * new_data_d3 = (char *)new_data + ggml_row_size(qtype, tensor->ne[0]) * d3_index * nrows;
-        const float * imatrix_03 = imatrix ? imatrix + d3_index * tensor->ne[0] : nullptr;
-        if (thread_count <= 1) {
-            // not threaded
-            out_size += ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, 0, nrows, n_per_row, imatrix);
-        } else {
-            std::vector <std::thread> threads;
-            int64_t counter = 0;
-            size_t new_size = 0;
-            bool valid = true;
-            for (uint32_t t = 0; t < thread_count; t++) {
-                auto func = [&mutex, &counter, &new_size, &valid, qtype, f32_data_d3, new_data_d3, chunk_size, nrows, n_per_row, imatrix]() {
-                    const int64_t nrows_per_chunk = chunk_size / n_per_row;
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        int64_t first_row = counter; 
-                        counter += nrows_per_chunk;
-                        if (first_row >= nrows) {
-                            if (local_size > 0) {
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-                        size_t this_size = ggml_quantize_chunk(qtype, f32_data_d3, new_data_d3, first_row * n_per_row, this_nrow, n_per_row, imatrix);
-                        local_size += this_size;
-
-                        // validate the quantized data; I am not sure how this would occur, but there is always the safe fallback on doing this single threaded.
-                        const size_t row_size  = ggml_row_size(qtype, n_per_row);
-                        void * this_data = (char *) new_data_d3 + first_row * row_size;
-                        if (!ggml_validate_row_data(qtype, this_data, this_size)) {
-                            std::unique_lock<std::mutex> lock(mutex);
-                            valid = false;
-                            break;
-                        }
-                    }
-                };
-                threads.push_back(std::thread(func));
-            }
-            for (auto & t : threads) t.join();
-
-            if (!valid) {
-                TTS_ABORT("Validation of quantized data failed. Please try again and/or switch to single thread quantization.\n");
-            }
-            out_size += new_size;
-        }
-    }
-    return out_size;
-}
-
-static void zeros(std::ofstream & file, size_t n) {
-    char zero = 0;
-    for (size_t i = 0; i < n; ++i) {
-        file.write(&zero, 1);
-    }
-}
-
-template <typename T>
-struct no_init {
-    T value;
-    no_init() { /* do nothing */ }
-};
-
-void quantize_gguf(const std::string & ifile, const std::string & ofile, struct quantization_params * params) {
-    ggml_context * weight_ctx = NULL;
-    struct gguf_init_params gguf_params = {
-        /*.no_alloc   =*/ false,
-        /*.ctx        =*/ &weight_ctx,
-    };
-    gguf_context * meta_ctx = gguf_init_from_file(ifile.c_str(), gguf_params);
-    std::string arch = "parler-tts"; // only parler-tts gguf files should lack an explicit architecture.
-
-    int arch_key = gguf_find_key(meta_ctx, "general.architecture");
-    if (arch_key != -1) {
-        arch = std::string(gguf_get_val_str(meta_ctx, arch_key));
-    }
-    tts_arch arch_type = SUPPORTED_ARCHITECTURES.at(arch);
-
-    if (params->quantize_type != GGML_TYPE_Q5_0 && params->quantize_type != GGML_TYPE_Q8_0 && params->quantize_type != GGML_TYPE_F16 && params->quantize_type != GGML_TYPE_Q4_0) {
-        fprintf(stdout, "Warning, %s is untested for quantization type '%d'. Use at your own risk.\n", arch.c_str(), params->quantize_type);
-    }
-
-    const size_t align = GGUF_DEFAULT_ALIGNMENT;
-    gguf_context_ptr ctx_out { gguf_init_empty() };
-
-    // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out.get(), meta_ctx);
-    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out.get(), "general.quantization_type", params->quantize_type);
-    for (ggml_tensor * tensor = ggml_get_first_tensor(weight_ctx); tensor; tensor = ggml_get_next_tensor(weight_ctx, tensor)) {
-        std::string name = ggml_get_name(tensor);
-        if (name.size() != 0) {
-            gguf_add_tensor(ctx_out.get(), tensor);
-        }
-    }
-
-    std::vector<no_init<uint8_t>> work;
-
-    std::ofstream fout;
-    auto close_ofstream = [&]() {
-        // Write metadata and close file handler
-        if (fout.is_open()) {
-            fout.seekp(0);
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out.get()));
-            gguf_get_meta_data(ctx_out.get(), data.data());
-            fout.write((const char *) data.data(), data.size());
-            fout.close();
-        }
-    };
-    auto new_ofstream = [&]() {
-        std::string fname = ofile;
-        fout = std::ofstream(fname, std::ios::binary);
-        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-        const size_t meta_size = gguf_get_meta_size(ctx_out.get());
-        // placeholder for the meta data
-        ::zeros(fout, meta_size);
-    };
-    new_ofstream();
-    for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) {
-        enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
-        std::string name = ggml_get_name(cur);
-        
-        if (name.size() == 0) {
-            continue;
-        }
-
-        if (is_quantizable(arch_type, name, params)) {
-            if ((cur->type) != GGML_TYPE_F32) {
-                TTS_ABORT("ERROR: All quantized tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
-            }
-            new_type = params->quantize_type;
-            if ((new_type >= GGML_TYPE_IQ2_XXS && new_type <= GGML_TYPE_IQ4_XS)) {
-                TTS_ABORT("ERROR: Quantization type '%d' requires an importance matrix.\n", new_type);
-            }
-            const int64_t nelement_size = ggml_nelements(cur) * 4;
-            if (work.size() < (size_t)nelement_size) {
-                work.resize(nelement_size); // upper bound on size
-            }
-            new_data = work.data();
-            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
-        } else if ((params->convert_non_quantizable_to_f16 && kokoro_is_f16_compatible(name)) || (params->convert_dac_to_f16 && has_prefix(name, "audio_encoder") && !has_suffix(name, "alpha"))) {
-            if ((cur->type) != GGML_TYPE_F32) {
-                TTS_ABORT("ERROR: All converted tensors must be transformed from 32bit floats. Tensor, '%s', has improper type, '%d'\n", cur->name, cur->type);
-            }
-            new_type = GGML_TYPE_F16;
-            const int64_t nelement_size = ggml_nelements(cur) * 4;
-            if (work.size() < (size_t)nelement_size) {
-                work.resize(nelement_size); // upper bound on size
-            }
-            new_data = work.data();
-            new_size = quantize_tensor(new_data, cur, nullptr, new_type, params->n_threads);
-        } else {
-            new_type = cur->type;
-            new_data = cur->data;
-            new_size = ggml_nbytes(cur);
-        }
-
-        gguf_set_tensor_type(ctx_out.get(), name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out.get(), name.c_str(), new_data, new_size);
-        fprintf(stdout, "At tensor: '%s' with new size: %zu bytes\n", name.c_str(), new_size);
-        // write tensor data + padding
-        fout.write((const char *) new_data, new_size);
-        zeros(fout, GGML_PAD(new_size, align) - new_size);
-    }
-    close_ofstream();
-}
diff --git a/src/tts_model.cpp b/src/tts_model.cpp
index 8fb8412..cb1924e 100644
--- a/src/tts_model.cpp
+++ b/src/tts_model.cpp
@@ -1,18 +1,21 @@
 #include "tts_model.h"
+#include "llama-mmap.h"
+
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
+#include "models/loaders.h"
 
-void append_to_response(struct tts_response * response, struct tts_response * to_append) {
-    float * new_data = (float *) malloc((response->n_outputs + to_append->n_outputs) * sizeof(float));
-    if (response->n_outputs > 0) {
-        std::memcpy(new_data, response->data, response->n_outputs*sizeof(float));
+void append_to_response(tts_response & response, tts_response & to_append) {
+    float * new_data = (float *) malloc((response.n_outputs + to_append.n_outputs) * sizeof(float));
+    if (response.n_outputs > 0) {
+        std::memcpy(new_data, response.data, response.n_outputs*sizeof(float));
     }
-    if (to_append->n_outputs > 0) {
-        float * next_loc = new_data + response->n_outputs;
-        std::memcpy(next_loc, to_append->data, to_append->n_outputs*sizeof(float));
+    if (to_append.n_outputs > 0) {
+        float * next_loc = new_data + response.n_outputs;
+        std::memcpy(next_loc, to_append.data, to_append.n_outputs*sizeof(float));
     }
-    response->data = new_data;
-    response->n_outputs += to_append->n_outputs;
+    response.data = new_data;
+    response.n_outputs += to_append.n_outputs;
 }
 
 /* 
@@ -97,6 +100,18 @@ void tts_runner::free_build() {
     }
 }
 
+tts_generation_runner::tts_generation_runner(const tts_model_loader & loader) : loader{ ref(loader) } {}
+
+tts_generation_runner::~tts_generation_runner() {}
+
+std::vector<std::string_view> tts_generation_runner::list_voices() {
+    GGML_ABORT("The architecture '%s' does not support #list_voices.", loader.get().arch);
+}
+
+void tts_generation_runner::update_conditional_prompt(const char * file_path, const char * prompt) {
+    GGML_ABORT("The architecture '%s' does not support update_conditional_prompt.", loader.get().arch);
+}
+
 void tts_model::prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size) {
     // currently DAC is only supported on cpu because the ops are not implemented on other devices;
     if (cpu_only) {
diff --git a/src/tts_model.h b/src/tts_model.h
index 93d0a21..0bbd21e 100644
--- a/src/tts_model.h
+++ b/src/tts_model.h
@@ -3,10 +3,13 @@
 
 #include <cstring>
 #include <functional>
+#include <ranges>
 #include "util.h"
 #include "common.h"
 
-void append_to_response(struct tts_response * response, struct tts_response * to_append);
+using namespace std;
+
+void append_to_response(tts_response & response, tts_response & to_append);
 
 using tensor_meta_callback = std::function<void(ggml_tensor*)>*;