diff --git a/README.md b/README.md index e32bc72..e88242c 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ In this endeavor, MacOS and metal support will be treated as the primary platfor | [Parler TTS Large](https://huggingface.co/parler-tts/parler-tts-large-v1)|✓|✓|✓|[here](https://huggingface.co/mmwillet2/Parler_TTS_GGUF)| | [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) |✓|✗|✓|[here](https://huggingface.co/mmwillet2/Kokoro_GGUF) | | [Dia](https://github.com/nari-labs/dia) |✓|✓|✓|[here](https://huggingface.co/mmwillet2/Dia_GGUF) | +| [Orpheus](https://github.com/canopyai/Orpheus-TTS) |✓|✗|✗|[here](https://huggingface.co/mmwillet2/Orpheus_GGUF) | Additional Model support will initially be added based on open source model performance in both the [old TTS model arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena) and [new TTS model arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2) as well as the availability of said models' architectures and checkpoints. diff --git a/examples/cli/README.md b/examples/cli/README.md index 0fe687f..549c41e 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -11,7 +11,7 @@ This simple example cli tool can be used to generate speach from a text prompt a In order to get a detailed breakdown the functionality currently available you can call the cli with the `--help` parameter. This will return a breakdown of all parameters: ```bash -./cli --help +./tts-cli --help --temperature (-t): The temperature to use when generating outputs. Defaults to 1.0. @@ -52,25 +52,44 @@ In order to get a detailed breakdown the functionality currently available you c General usage should follow from these possible parameters. E.G. The following command will save generated speech to the `/tmp/test.wav` file. 
```bash -./cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav +./tts-cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav ``` -#### Dia Generation Arguments +#### Dia and Orpheus Generation Arguments -Currently the default cli arguments are not aligned with Dia's default sampling settings. Specifically the temperature and topk settings should be changed to `1.3` and `35` respectively when generating with Dia like so: +Currently the default cli arguments are not aligned with Dia's or Orpheus' default sampling settings. Specifically the temperature and topk settings should be changed to `1.3` and `35` respectively when generating with Dia like so: -```base -./cli --model-path /model/path/to/Dia.gguf --prompt "[S1] Hi, I am Dia, this is how I talk." --save-path /tmp/test.wav --topk 35 --temperature 1.3 +```bash +./tts-cli --model-path /model/path/to/Dia.gguf --prompt "[S1] Hi, I am Dia, this is how I talk." --save-path /tmp/test.wav --topk 35 --temperature 1.3 ``` +and the voice, temperature, and repetition penalty setting should be changed to a valid voice (e.g. `leah`), `0.7`, and `1.1` respectively when generating with Orpheus like so: + +```bash +./tts-cli --model-path /model/path/to/Orpheus.gguf --prompt "Hi, I am Orpheus, this is how I talk." --save-path /tmp/test.wav --voice leah --temperature 0.7 --repetition-penalty 1.1 +``` + + #### Conditional Generation +Conditional generation is a Parler TTS specific behavior. + By default the Parler TTS model is saved to the GGUF format with a pre-encoded conditional prompt (i.e. 
a prompt used to determine how to generate speech), but if the text encoder model, the T5-Encoder model, is avaiable in gguf format (see the [python convertion scripts](../../py-gguf/README.md) for more information on how to prepare the T5-Encoder model) then a new conditional prompt can be used for generation like so: ```bash -./cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav --text-encoder-path /model/path/to/t5_encoder_file.gguf --consditional-prompt "deep voice" +./tts-cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav --text-encoder-path /model/path/to/t5_encoder_file.gguf --consditional-prompt "deep voice" ``` +#### Distinct Voice Support + +Kokoro and Orpheus both support voices which can be set via the `--voice` (`-v`) argument. Orpheus supports the following voices: + +``` +"zoe", "zac","jess", "leo", "mia", "julia", "leah" +``` + +and Kokoro supports the voices listed in the section below. + #### MultiLanguage Configuration Kokoro supports multiple langauges with distinct voices, and, by default, the standard voices are encoded in the Kokoro gguf file.
Below is a list of the available voices: diff --git a/ggml b/ggml index 1e85c87..136da02 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 1e85c87aeaa70548ad52766f1881c2f1257962e2 +Subproject commit 136da02ac32d5011cf9b46b117a0ea1be24e2bad diff --git a/include/common.h b/include/common.h index 02de8e1..fc0dcdf 100644 --- a/include/common.h +++ b/include/common.h @@ -18,12 +18,14 @@ enum tts_arch { PARLER_TTS_ARCH = 0, KOKORO_ARCH = 1, DIA_ARCH = 2, + ORPHEUS_ARCH = 3, }; const std::map SUPPORTED_ARCHITECTURES = { { "parler-tts", PARLER_TTS_ARCH }, { "kokoro", KOKORO_ARCH }, { "dia", DIA_ARCH }, + { "orpheus", ORPHEUS_ARCH } }; struct generation_configuration { diff --git a/include/tts.h b/include/tts.h index 23c55d0..def032b 100644 --- a/include/tts.h +++ b/include/tts.h @@ -4,6 +4,7 @@ #include "parler_model.h" #include "kokoro_model.h" #include "dia_model.h" +#include "orpheus_model.h" #include #include #include @@ -11,6 +12,7 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true); int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config); void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string 
prompt, bool cpu_only = true); diff --git a/py-gguf/convert_orpheus_to_gguf b/py-gguf/convert_orpheus_to_gguf new file mode 100644 index 0000000..a2247a0 --- /dev/null +++ b/py-gguf/convert_orpheus_to_gguf @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import argparse +from tts_encoders.orpheus_gguf_encoder import OrpheusEncoder, DEFAULT_ORPHEUS_REPO_ID, DEFAULT_SNAC_REPO_ID +from os.path import isdir, dirname + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--save-path", type=str, required=True, help="the path to save the converted gguf tts model to.") + parser.add_argument("--repo-id", type=str, required=False, default=DEFAULT_ORPHEUS_REPO_ID, help="The Huggingface repository to pull the model from.") + parser.add_argument("--snac-repo-id", type=str, required=False, default=DEFAULT_SNAC_REPO_ID, help="The Huggingface repository to pull the snac audio decoder model from.") + parser.add_argument("--never-make-dirs", default=False, action="store_true", help="When set the script will never add new directories.") + return parser.parse_known_args() + + +if __name__ == '__main__': + args, _ = parse_arguments() + if not isdir(dirname(args.save_path)) and args.never_make_dirs: + raise ValueError(f"model path, {args.save_path} is not a valid path.") + OrpheusEncoder(args.save_path, repo_id=args.repo_id, snac_repo_id=args.snac_repo_id).write() diff --git a/py-gguf/requirements.txt b/py-gguf/requirements.txt index 9c8f17e..0f326ad 100644 --- a/py-gguf/requirements.txt +++ b/py-gguf/requirements.txt @@ -4,8 +4,8 @@ gguf==0.10.0 spacy==3.8.5 kokoro==0.9.4 huggingface-hub>=0.26.5 -transformers>=4.43.3 -parler_tts @ git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17 +transformers>=4.46.0 +parler_tts @ git+https://github.com/huggingface/parler-tts.git@d108732cd57788ec86bc857d99a6cabd66663d68 gguf==0.10.0 safetensors==0.5.3 groovy==0.1.2 @@ -14,5 +14,7 @@ gradio-client==1.10.0 llvmlite==0.44.0 numba==0.61.2 scipy>=1.15.2 +snac==1.2.1
soundfile>=0.13.1 -nari-tts @ git+https://github.com/nari-labs/dia.git@7cf50c889c6013f74326cbdcb7696a985a4cf9c1 +nari-tts @ git+https://github.com/nari-labs/dia.git@2811af1c5f476b1f49f4744fabf56cf352be21e5 +torchvision==0.21.0 \ No newline at end of file diff --git a/py-gguf/tts_encoders/__init__.py b/py-gguf/tts_encoders/__init__.py index 6a03ada..81e5edd 100644 --- a/py-gguf/tts_encoders/__init__.py +++ b/py-gguf/tts_encoders/__init__.py @@ -5,3 +5,4 @@ from .kokoro_gguf_encoder import * from .dia_gguf_encoder import * from .dac_gguf_encoder import * +from .orpheus_gguf_encoder import * diff --git a/py-gguf/tts_encoders/dia_gguf_encoder.py b/py-gguf/tts_encoders/dia_gguf_encoder.py index dcafee8..dfa9c0b 100644 --- a/py-gguf/tts_encoders/dia_gguf_encoder.py +++ b/py-gguf/tts_encoders/dia_gguf_encoder.py @@ -82,7 +82,7 @@ def prepare_decoder_tensors(self): elif parts[0] == "norm": self.set_tensor(f"{base}.norm", param) elif parts[0] == "logits_dense": - heads = param.shape[1]; + heads = param.shape[1] for i in range(heads): head = param.data[:, i] self.set_tensor(f"{base}.heads.{i}", head.transpose(0,1)) diff --git a/py-gguf/tts_encoders/kokoro_gguf_encoder.py b/py-gguf/tts_encoders/kokoro_gguf_encoder.py index 54f4db7..ba0685f 100644 --- a/py-gguf/tts_encoders/kokoro_gguf_encoder.py +++ b/py-gguf/tts_encoders/kokoro_gguf_encoder.py @@ -96,7 +96,7 @@ class KokoroEncoder(TTSEncoder): gguf_encoder.write() ``` """ - def __init__(self, model_path: Path | str = "./kokoro.gguf", repo_id: Path | str =DEFAULT_KOKORO_REPO, + def __init__(self, model_path: Path | str = "./kokoro.gguf", repo_id: Path | str = DEFAULT_KOKORO_REPO, voices: Optional[List[str]] = None, use_espeak: bool = False, phonemizer_repo: Path | str = DEFAULT_TTS_PHONEMIZER_REPO): """ diff --git a/py-gguf/tts_encoders/orpheus_gguf_encoder.py b/py-gguf/tts_encoders/orpheus_gguf_encoder.py new file mode 100644 index 0000000..89b9317 --- /dev/null +++ b/py-gguf/tts_encoders/orpheus_gguf_encoder.py @@ -0,0 
+1,244 @@ +from huggingface_hub import hf_hub_download +from pathlib import Path +from snac import SNAC +from snac.layers import DecoderBlock +from transformers import AutoModelForCausalLM +from transformers.models.llama import LlamaForCausalLM +from typing import Dict +from .dac_gguf_encoder import DAC_RESIDUAL_UNIT_PARTS +from .tts_encoder import TTSEncoder +from .tensor_util import get_normalized_weight_from_parametrizations + +import gguf +import json +import math +import torch + +DEFAULT_ORPHEUS_REPO_ID = "canopylabs/orpheus-3b-0.1-ft" +DEFAULT_SNAC_REPO_ID = "hubertsiuzdak/snac_24khz" +ORPHEUS_ARCHITECTURE = "orpheus" + + +class OrpheusEncoder(TTSEncoder): + """ + The purpose of this class is to encode and write the tensors and model configuration for the Orpheus TTS model that + into a GGUF file. + + General Usage: + + ```python + from tts_encoders import OrpheusEncoder + + gguf_encoder = OrpheusEncoder("some/local/path.gguf") + gguf_encoder.write() + ``` + """ + def __init__(self, model_path: Path | str = "./orpheus.gguf", repo_id: Path | str = DEFAULT_ORPHEUS_REPO_ID, + snac_repo_id: Path | str = DEFAULT_SNAC_REPO_ID): + """ + :param Path or str model_path: The path to save the generated GGUF file. + :param Path or str repo_id: The path or repository from which to pull the orpheus model and its tokenizer. + :param Path or str snac_repo_id: The path or repository from which to pull the SNAC audio decoder. 
+ """ + super().__init__(model_path=model_path, architecture=ORPHEUS_ARCHITECTURE) + self._model = None + self._snac_model = None + self._tokenizer_json = None + self._config = None + self.repo_id = repo_id + self.snac_repo_id = snac_repo_id + + @property + def model(self) -> LlamaForCausalLM: + if self._model is None: + try: + self._model = AutoModelForCausalLM.from_pretrained(self.repo_id).eval().to(device="cpu") + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, when attempting to obtain Orpheus at path or repo: '{self.repo_id}'" + ) + raise e + return self._model + + @property + def snac_model(self) -> SNAC: + if self._snac_model is None: + try: + self._snac_model = SNAC.from_pretrained(self.snac_repo_id).eval().to("cpu") + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, when attempting to obtain SNAC Model at path or repo: '{self.snac_repo_id}'" + ) + raise e + return self._snac_model + + @property + def tokenizer_json(self) -> Dict: + if self._tokenizer_json is None: + try: + conf_path = hf_hub_download(repo_id=self.repo_id, filename='tokenizer.json') + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, attempting to obtain tokenizer.json via repository '{self.repo_id}'." 
+ ) + raise e + with open(conf_path, "r+") as f: + self._tokenizer_json = json.load(f) + return self._tokenizer_json + + def simplify_snac_name(self, name: str) -> str: + parts = name.split(".") + model_index = int(parts[0]) + if model_index == 6: + return "alpha_out" + elif model_index == 7: + return f"final.{parts[1]}" + elif model_index == 0: + return f"in.{parts[1]}" + elif model_index == 1: + return f"up.{parts[1]}" + else: + model_index -= 2 + layer_index = int(parts[2]) + if layer_index == 0: + return f"layers.{model_index}.alpha" + elif layer_index == 1: + return f"layers.{model_index}.{parts[-1]}" + elif layer_index == 2: + return f"layers.{model_index}.noise_{parts[-1]}" + else: + base = f"layers.{model_index}.residual_unit.{layer_index - 3}" + return base + "." + DAC_RESIDUAL_UNIT_PARTS[".".join(parts[-3:])] + + def prepare_tensors(self): + self.prepare_orpheus_tensors() + self.prepare_snac_tensors() + self.prepare_rope_frequencies() + + def prepare_orpheus_tensors(self): + for name, param in self.model.model.named_parameters(): + name = f"orpheus.{name[:-7]}" # all names end in ".weight" for Orpheus + self.set_tensor(name, param) + self.set_tensor("orpheus.lm_head", self.model.lm_head.weight) + + def prepare_snac_tensors(self): + modules = {n: v for n, v in self.snac_model.quantizer.named_modules()} + for name, param in self.snac_model.quantizer.named_parameters(): + if "parametrizations.weight.original0" in name: + param = get_normalized_weight_from_parametrizations(modules, name) + name = name.replace("parametrizations.weight.original0", "weight") + elif "parametrizations.weight" in name: + continue + self.set_tensor(f"snac.{name}", param) + + modules = {n: v for n, v in self.snac_model.decoder.model.named_modules()} + for name, param in self.snac_model.decoder.model.named_parameters(): + if "parametrizations.weight.original0" in name: + param = get_normalized_weight_from_parametrizations(modules, name) + name = 
name.replace("parametrizations.weight.original0", "weight") + elif "parametrizations.weight" in name: + continue + name = self.simplify_snac_name(name) + self.set_tensor(f"snac.{name}", param) + + def prepare_rope_frequencies(self): + """ + Because Llama-3 like Rotary Positional Embeddings are not currently supported out-of-the-box in GGML, + we need to encode the rope frequency vectors to use directly. + """ + base = self.model.config.rope_theta + dim = self.model.config.head_dim + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + factor = self.model.config.rope_scaling.get("factor", 8.0) + low_freq_factor = self.model.config.rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = self.model.config.rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.model.config.rope_scaling.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.set_tensor("orpheus.rope_frequencies", torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_metadata(self): + """ + Implementation of TTSEncoder's Abstract method see TTSEncoder for more information + """ + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + self.metadata = gguf.Metadata.load(None, None, self.repo_id, total_params) + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = 
gguf.size_label(total_params, shared_params, expert_params, expert_count) + + self.set_type() + self.set_gguf_parameters() + self.metadata.set_gguf_meta_model(self.gguf_writer) + self.set_vocab() + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def set_gguf_parameters(self): + """ + The purpose of this function is to add general model configuration to the GGUF file writer. + """ + + # this is not set in Orpheus configuration or on the class level. It is passed as a + # a default parameter to the generation function. + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.stopping_token_id", 128258) + + # ---- Orpheus configuration ---- + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.hidden_size", self.model.config.hidden_size) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.vocab_size", self.model.config.vocab_size) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.attn_heads", self.model.config.num_attention_heads) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.kv_attn_heads", self.model.config.num_key_value_heads) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.head_dim", self.model.config.head_dim) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.layers", self.model.config.num_hidden_layers) + self.gguf_writer.add_uint32( + f"{self.gguf_writer.arch}.kv_hidden_size", + self.model.config.hidden_size // (self.model.config.num_attention_heads // self.model.config.num_key_value_heads) + ) + + # ---- SNAC configuration ---- + self.gguf_writer.add_uint32("snac.audio_token_channels", self.snac_model.quantizer.n_codebooks) + layer_index = 0 + for module in self.snac_model.decoder.model: + if isinstance(module, DecoderBlock): + self.gguf_writer.add_uint32(f"snac.snac_layer_stride_{layer_index}", module.block[1].stride[0]) + self.gguf_writer.add_uint32(f"snac.snac_layer_padding_{layer_index}", module.block[1].padding[0]) + self.gguf_writer.add_uint32(f"snac.snac_layer_grouping_{layer_index}", 
module.block[3].block[1].groups) + layer_index += 1 + + # The file type setting is purely for describing the primary precision of the model as it is stored in the GGUF file. + # This setting *does not* enforce the tensor format or alter tensor processing capabilities in TTS.cpp and is only + # used for reporting. + self.gguf_writer.add_file_type(gguf.LlamaFileType.ALL_F32) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def set_vocab(self): + """ + The purpose of this function is to add the vocab, merges, and configuration for Orpheus' BPE tokenizer + to the GGUF file writer. + """ + assert "model" in self.tokenizer_json and "type" in self.tokenizer_json["model"] and self.tokenizer_json["model"]["type"] == "BPE" \ + and "merges" in self.tokenizer_json["model"] and "vocab" in self.tokenizer_json["model"] + tokens = list(self.tokenizer_json["model"]["vocab"].keys()) + self.logger.debug(f"Orpheus tokenizer vocab contains {len(tokens)} tokens.") + merges = [" ".join(pair) for pair in self.tokenizer_json["model"]["merges"]] + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_merges(merges) + self.gguf_writer.add_eos_token_id(self.model.config.eos_token_id) + self.gguf_writer.add_bos_token_id(self.model.config.bos_token_id) diff --git a/py-gguf/tts_encoders/tensor_util.py b/py-gguf/tts_encoders/tensor_util.py index f546ddd..e5b8773 100644 --- a/py-gguf/tts_encoders/tensor_util.py +++ b/py-gguf/tts_encoders/tensor_util.py @@ -1,4 +1,6 @@ +import torch import torch.nn as nn +from torch.nn.utils.parametrizations import _WeightNorm from torch.nn.utils.weight_norm import WeightNorm from typing import Dict @@ -23,3 +25,26 @@ def get_regularized_weight(modules: Dict[str, nn.Module], parameter_name: str) - hook(module, None) break return module.weight + + +def get_normalized_weight_from_parametrizations(modules: Dict[str, nn.Module], parameter_name: str) -> torch.Tensor: + """ + Attempts to call the default parametrization forward pass for weight normalization
such that the true weight + can be determined via the stored parametrized variables. + + :param Dict[str, nn.Module] modules: a dictionary containing modules belonging to the current module context by name + :param str parameter_name: the base parameter name from which the normalized weight is to be derived. + :return torch.Tensor: the computed normalized weight tensor. + """ + parent_module_name = parameter_name.split(".parametrizations")[0] + if parent_module_name not in modules: + raise KeyError(f"Failed to find module, {parent_module_name}, for parameter, {parameter_name}, in modules dictionary.") + module = modules[parent_module_name] + if "weight" not in module.parametrizations: + raise KeyError(f"Failed to find parameterized weight on module, {parent_module_name}, for parameter, {parameter_name}.") + assert isinstance(module.parametrizations["weight"][0], _WeightNorm) + return torch._weight_norm( + module.parametrizations["weight"].original1, + module.parametrizations["weight"].original0, + module.parametrizations["weight"][0].dim + ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6244815..3d07940 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,6 +23,9 @@ add_library(tts tts_model.cpp kokoro_model.cpp dia_model.cpp + orpheus_model.cpp + snac_model.cpp + general_neural_audio_codec.cpp ) target_include_directories(tts PUBLIC . 
../include ../ggml/src/) diff --git a/src/dac_model.cpp b/src/dac_model.cpp index d53b7ff..6685007 100644 --- a/src/dac_model.cpp +++ b/src/dac_model.cpp @@ -9,26 +9,12 @@ static const std::map DAC_TENSOR_GGUF_LOOKUP = { {"final.bias", DAC_ENCODER_OUT_BIAS}, {"final.weight", DAC_ENCODER_OUT_KERNEL}, {"final.alpha", DAC_ENCODER_SNAKE_ALPHA}, - {".final.alpha", DAC_ENCODER_LAYER_SNAKE_ALPHA}, - {".final.bias", DAC_ENCODER_LAYER_OUT_BIAS}, - {".final.weight", DAC_ENCODER_LAYER_OUT_KERNEL}, - {".res.initial.alpha", DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE}, - {".res.initial.bias", DAC_ENCODER_LAYER_RES_BLK_IN_BIAS}, - {".res.initial.weight", DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL}, - {".res.final.alpha", DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE}, - {".res.final.bias", DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS}, - {".res.final.weight", DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL}, - {".in_proj.bias", DAC_QUANTIZER_LAYER_IN_BIAS}, - {".in_proj.weight", DAC_QUANTIZER_LAYER_IN_KERNEL}, - {".out_proj.bias", DAC_QUANTIZER_LAYER_OUT_BIAS}, - {".out_proj.weight", DAC_QUANTIZER_LAYER_OUT_KERNEL}, - {".codebook.weight", DAC_QUANTIZER_LAYER_CODEBOOK}, }; void dac_model::prep_constants(gguf_context * meta) { int output_heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.output_heads", "output_heads", "dia.decoder.output_heads"}); if (output_heads_key != -1) { - n_heads = gguf_get_val_u32(meta, output_heads_key);; + n_heads = gguf_get_val_u32(meta, output_heads_key); } int sampling_factor_key = search_for_gguf_keys(meta, {"dac.up_sampling_factor", "up_sampling_factor"}); @@ -40,37 +26,30 @@ void dac_model::prep_constants(gguf_context * meta) { if (max_gen_key != -1) { max_generation_size = gguf_get_val_u32(meta, max_gen_key); } - - for (int i = 0; i < (int) layers.size(); i++) { - std::string stride_kw = "dac_layer_stride_" + std::to_string(i); - std::string padding_kw = "dac_layer_padding_" + std::to_string(i); - int layer_stride_key = search_for_gguf_keys(meta, {"dac." 
+ stride_kw, stride_kw}); - if (layer_stride_key == -1) { - TTS_ABORT("key %s must be specified in gguf file.", ("dac." + stride_kw).c_str()); - } - layers[i].stride = gguf_get_val_u32(meta, layer_stride_key); - int layer_padding_key = search_for_gguf_keys(meta, {"dac." + padding_kw, padding_kw}); - if (layer_padding_key == -1) { - TTS_ABORT("key %s must be specified in gguf file.", ("dac." + padding_kw).c_str()); - } - layers[i].padding = gguf_get_val_u32(meta, layer_padding_key); - } } void dac_model::prep_layers(gguf_context * meta) { for (int i = 0; i < n_heads; i++) { - dac_quantize_layer l; - quantizer_layers.push_back(l); + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); } for (int i = 0; i < n_layers; i++) { - dac_layer l; - // all dac layers have 3 residual units - for (int ii = 0; ii < 3; ii++) { - dac_residual_unit u; - l.residual_blocks.push_back(u); + std::string stride_key = "dac_layer_stride_" + std::to_string(i); + std::string padding_key = "dac_layer_padding_" + std::to_string(i); + int layer_stride_key = search_for_gguf_keys(meta, {"dac." + stride_key, stride_key}); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = search_for_gguf_keys(meta, {"dac." 
+ padding_key, padding_key}); + if (layer_padding_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", padding_key.c_str()); } - layers.push_back(l); + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + } + ); } } @@ -78,95 +57,6 @@ void dac_model::assign_weight(std::string name, ggml_tensor * tensor) { assign_to_audio_encoder(this, name, tensor); } -void assign_residual_unit(dac_model * model, dac_residual_unit * l, std::string name, ggml_tensor * tensor) { - try { - dac_tensor tensor_type = DAC_TENSOR_GGUF_LOOKUP.at(name); - switch (tensor_type) { - case DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE: - l->in_snake_alpha = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->in_snake_alpha, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE: - l->out_snake_alpha = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->out_snake_alpha, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL: - l->in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->in_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL: - l->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->out_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_IN_BIAS: - l->in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(l->in_conv_bias, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS: - l->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(l->out_conv_bias, tensor); - break; - default: - fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str()); - break; - } - } catch (const std::out_of_range& e) { - TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); - } - -} - -void assign_dac_layer(dac_model * model, 
dac_layer * layer, std::string name, ggml_tensor * tensor) { - if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) { - switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { - case DAC_ENCODER_LAYER_SNAKE_ALPHA: - layer->snake_alpha_in = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->snake_alpha_in, tensor); - break; - case DAC_ENCODER_LAYER_OUT_KERNEL: - layer->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->out_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_OUT_BIAS: - layer->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(layer->out_conv_bias, tensor); - break; - default: - fprintf(stdout, "layer unassigned tensor %s\n", name.c_str()); - break; - } - } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { - auto pair = parse_layer_count(name); - int l = pair.first; - std::string lt_name = pair.second; - assign_residual_unit(model, &layer->residual_blocks[l], lt_name, tensor); - } -} - -void assign_quantizer_layer(dac_model * model, dac_quantize_layer * layer, std::string name, ggml_tensor * tensor) { - try { - switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { - case DAC_QUANTIZER_LAYER_OUT_KERNEL: - layer->out_proj_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->out_proj_kernel, tensor); - break; - case DAC_QUANTIZER_LAYER_OUT_BIAS: - layer->out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(layer->out_proj_bias, tensor); - break; - case DAC_QUANTIZER_LAYER_CODEBOOK: - layer->codebook = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->codebook, tensor); - break; - default: - fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str()); - break; - } - } catch (const std::out_of_range& e) { - TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); - } -} - void assign_to_audio_encoder(dac_model * model, 
std::string name, ggml_tensor * tensor) { if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) { switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { @@ -199,14 +89,14 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * int l = pair.first; std::string lt_name = pair.second; if (name.find("quantizers") != std::string::npos) { - assign_quantizer_layer(model, &model->quantizer_layers[l], lt_name, tensor); + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) model, model->quantizer_layers[l], lt_name, tensor); } else { - assign_dac_layer(model, &model->layers[l - 1], lt_name, tensor); + general_neural_audio_codec::assign_to_layer((tts_model *) model, model->layers[l - 1], lt_name, tensor); } } } -static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers) { +static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers) { struct ggml_tensor * embd; dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length*dctx->model->n_heads); @@ -220,10 +110,7 @@ static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, st auto quantize_layer = dctx->model->quantizer_layers[i]; struct ggml_tensor * code = ggml_cont(ctx, ggml_view_2d(ctx, dctx->inp_tokens, 1, batch.sequence_length, dctx->model->n_heads*ggml_type_size(GGML_TYPE_I32), i*ggml_type_size(GGML_TYPE_I32))); code = ggml_reshape_1d(ctx, code, batch.sequence_length); - code = ggml_get_rows(ctx, quantize_layer.codebook, code); - code = ggml_cont(ctx, ggml_transpose(ctx, code)); - code = ggml_conv_1d(ctx, quantize_layer.out_proj_kernel, code, 1, 0, 1); - code = ggml_add(ctx, code, quantize_layer.out_proj_bias); + code = general_neural_audio_codec::build_quantize_layer(ctx, code, quantize_layer); if (i == 0) { embd = code; @@ -234,27 +121,6 @@ static struct 
ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, st return embd; } -static struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, dac_residual_unit & u, int padding, int dilation) { - struct ggml_tensor * residual = cur; - cur = snake_1d(ctx, u.in_snake_alpha, cur); - cur = ggml_conv_1d(ctx, u.in_conv_kernel, cur, 1, padding, dilation); - cur = ggml_add(ctx, cur, u.in_conv_bias); - cur = snake_1d(ctx, u.out_snake_alpha, cur); - cur = ggml_conv_1d(ctx, u.out_conv_kernel, cur, 1, 0, 1); - cur = ggml_add(ctx, cur, u.out_conv_bias); - return ggml_add(ctx, cur, residual); -} - -static struct ggml_tensor * build_decoder_block(ggml_context * ctx, struct ggml_tensor * cur, dac_layer & l, struct dac_context * dctx) { - cur = snake_1d(ctx, l.snake_alpha_in, cur); - cur = ggml_conv_transpose_1d(ctx, l.out_conv_kernel, cur, l.stride, l.padding, 1, 0, 1); - cur = ggml_add(ctx, cur, l.out_conv_bias); - for (int i = 0; i < l.residual_blocks.size(); i++) { - cur = build_residual_unit(ctx, cur, l.residual_blocks[i], pow(3, (i + 1)), pow(3, i)); - } - return cur; -} - struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu) { dac_context * dctx = new dac_context(model, n_threads); if (!use_cpu) { @@ -291,7 +157,7 @@ struct ggml_cgraph * dac_runner::build_dac_graph(dac_ubatch & batch) { cur = ggml_conv_1d(ctx, model->in_conv_kernel, inputs, 1, 3, 1); cur = ggml_add(ctx, cur, model->in_conv_bias); for (auto l : model->layers) { - cur = build_decoder_block(ctx, cur, l, dctx); + cur = general_neural_audio_codec::build_layer(ctx, cur, l); } cur = snake_1d(ctx, model->snake_alpha, cur); cur = ggml_conv_1d(ctx, model->out_conv_kernel, cur, 1, 3, 1); diff --git a/src/dac_model.h b/src/dac_model.h index 6befa32..be43ad0 100644 --- a/src/dac_model.h +++ b/src/dac_model.h @@ -1,7 +1,7 @@ #ifndef dac_model_h #define dac_model_h -#include "tts_model.h" +#include "general_neural_audio_codec.h" #include enum 
dac_tensor { @@ -10,40 +10,6 @@ enum dac_tensor { DAC_ENCODER_OUT_KERNEL, DAC_ENCODER_OUT_BIAS, DAC_ENCODER_SNAKE_ALPHA, - DAC_ENCODER_LAYER_SNAKE_ALPHA, - DAC_ENCODER_LAYER_OUT_KERNEL, - DAC_ENCODER_LAYER_OUT_BIAS, - DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE, - DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE, - DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL, - DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL, - DAC_ENCODER_LAYER_RES_BLK_IN_BIAS, - DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS, - DAC_QUANTIZER_LAYER_IN_KERNEL, - DAC_QUANTIZER_LAYER_IN_BIAS, - DAC_QUANTIZER_LAYER_OUT_KERNEL, - DAC_QUANTIZER_LAYER_OUT_BIAS, - DAC_QUANTIZER_LAYER_CODEBOOK -}; - -struct dac_residual_unit { - struct ggml_tensor * in_snake_alpha; - struct ggml_tensor * in_conv_kernel; - struct ggml_tensor * in_conv_bias; - struct ggml_tensor * out_snake_alpha; - struct ggml_tensor * out_conv_kernel; - struct ggml_tensor * out_conv_bias; -}; - -struct dac_layer { - struct ggml_tensor * snake_alpha_in; - struct ggml_tensor * out_conv_kernel; - struct ggml_tensor * out_conv_bias; - - uint32_t padding; - uint32_t stride; - - std::vector residual_blocks; }; struct dac_quantize_layer { @@ -52,6 +18,7 @@ struct dac_quantize_layer { struct ggml_tensor * codebook; }; +// DAC, Descript Audio Codec, is a channel token to audio autoencoder model (though we only use its decoder functionality). // this struct maintains the static tensors for the dac audio decoder graph. // As such, this is designed to contain basic configuration and ggml tensor support for DAC. // The dac_runner describes how the graph is built and run. 
@@ -67,8 +34,8 @@ struct dac_model : tts_model { struct ggml_tensor * out_conv_kernel; struct ggml_tensor * out_conv_bias; struct ggml_tensor * snake_alpha; - std::vector layers; - std::vector quantizer_layers; + std::vector layers; + std::vector quantizer_layers; void assign_weight(std::string name, ggml_tensor * weight); void prep_constants(gguf_context * meta); @@ -81,9 +48,6 @@ struct dac_model : tts_model { }; // for loading DAC model from gguf file -void assign_residual_unit(dac_model * model, dac_residual_unit * layer, std::string name, ggml_tensor * tensor); -void assign_dac_layer(dac_model * model, dac_layer * layer, std::string name, ggml_tensor * tensor); -void assign_quantizer_layer(dac_model * model, dac_quantize_layer layer, std::string name, ggml_tensor * tensor); void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor); // the context used for running the dac model @@ -91,10 +55,7 @@ struct dac_context : runner_context { dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {}; struct dac_model * model; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - + struct ggml_tensor * inp_tokens; void build_schedule() { @@ -109,9 +70,7 @@ struct dac_ubatch { uint32_t sequence_length; }; -static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers); -static struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, dac_residual_unit & u, int padding, int dilation); -static struct ggml_tensor * build_decoder_block(ggml_context * ctx, struct ggml_tensor * cur, dac_layer & l, struct dac_context * dctx); +static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers); // This struct is intended to manage the dac model's graph compilation and compute 
function. struct dac_runner : tts_runner { diff --git a/src/dia_model.h b/src/dia_model.h index 69ba6f6..bdca91d 100644 --- a/src/dia_model.h +++ b/src/dia_model.h @@ -99,7 +99,6 @@ struct dia_context : runner_context { uint32_t current_position = 0; // current position in the active sequence int delay_steps = -1; // the max remaining steps to take before terminating; is set after an eos token is seen on the first output channel size_t prompt_size = 0; - float * logits = nullptr; uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model. diff --git a/src/general_neural_audio_codec.cpp b/src/general_neural_audio_codec.cpp new file mode 100644 index 0000000..8f7893e --- /dev/null +++ b/src/general_neural_audio_codec.cpp @@ -0,0 +1,172 @@ +#include "general_neural_audio_codec.h" +#include +#include +#include + +namespace general_neural_audio_codec { + // This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file + // to the general_neural_audio_codec::layer. + // Please note that some gguf_tensor values have multiple keys; this is to support backwards compatibility with original DAC settings. 
+ static const std::map GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP = { + {".final.alpha", LAYER_ALPHA}, + {".final.bias", LAYER_INPUT_BIAS}, + {".final.weight", LAYER_INPUT_KERNEL}, + {".alpha", LAYER_ALPHA}, + {".bias", LAYER_INPUT_BIAS}, + {".weight", LAYER_INPUT_KERNEL}, + {".noise_weight", LAYER_NOISE_KERNEL}, + {".res.initial.alpha", RESIDUAL_UNIT_INPUT_ALPHA}, + {".res.initial.bias", RESIDUAL_UNIT_INPUT_BIAS}, + {".res.initial.weight", RESIDUAL_UNIT_INPUT_KERNEL}, + {".res.final.alpha", RESIDUAL_UNIT_OUTPUT_ALPHA}, + {".res.final.bias", RESIDUAL_UNIT_OUTPUT_BIAS}, + {".res.final.weight", RESIDUAL_UNIT_OUTPUT_KERNEL}, + {".in_alpha", RESIDUAL_UNIT_INPUT_ALPHA}, + {".in_bias", RESIDUAL_UNIT_INPUT_BIAS}, + {".in_weight", RESIDUAL_UNIT_INPUT_KERNEL}, + {".out_alpha", RESIDUAL_UNIT_OUTPUT_ALPHA}, + {".out_bias", RESIDUAL_UNIT_OUTPUT_BIAS}, + {".out_weight", RESIDUAL_UNIT_OUTPUT_KERNEL}, + {".out_proj.bias", QUANTIZER_LAYER_OUT_BIAS}, + {".out_proj.weight", QUANTIZER_LAYER_OUT_KERNEL}, + {".codebook.weight", QUANTIZER_LAYER_CODEBOOK}, + }; + + void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor) { + try { + gguf_tensor tensor_type = GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name); + switch (tensor_type) { + case RESIDUAL_UNIT_INPUT_ALPHA: + unit.in_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.in_alpha, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_ALPHA: + unit.out_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.out_alpha, tensor); + break; + case RESIDUAL_UNIT_INPUT_KERNEL: + unit.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.in_conv_kernel, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_KERNEL: + unit.out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.out_conv_kernel, tensor); + break; + case RESIDUAL_UNIT_INPUT_BIAS: + unit.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, 
tensor)); + model->set_tensor(unit.in_conv_bias, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_BIAS: + unit.out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(unit.out_conv_bias, tensor); + break; + default: + fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::residual_unit tensor.", name.c_str()); + } + } + + void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor) { + if (GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.find(name) != GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.end()) { + switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) { + case LAYER_ALPHA: + l.in_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.in_alpha, tensor); + break; + case LAYER_INPUT_KERNEL: + l.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.in_conv_kernel, tensor); + break; + case LAYER_INPUT_BIAS: + l.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(l.in_conv_bias, tensor); + break; + case LAYER_NOISE_KERNEL: + l.noise_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.noise_conv_kernel, tensor); + break; + default: + fprintf(stdout, "layer unassigned tensor %s\n", name.c_str()); + break; + } + } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { + auto pair = parse_layer_count(name); + int i = pair.first; + std::string lt_name = pair.second; + assign_to_residual_unit(model, l.residual_blocks[i], lt_name, tensor); + } else { + TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::layer tensor.", name.c_str()); + } + } + + void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor) { + try { + 
switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) { + case QUANTIZER_LAYER_OUT_KERNEL: + l.out_proj_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.out_proj_kernel, tensor); + break; + case QUANTIZER_LAYER_OUT_BIAS: + l.out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(l.out_proj_bias, tensor); + break; + case QUANTIZER_LAYER_CODEBOOK: + l.codebook = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.codebook, tensor); + break; + default: + fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + // older GGUF files still have the unused in_proj convolutional layer, so ignore it if we find it. + if (!has_prefix(name, ".in_proj")) { + TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); + } + } + } + + struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit) { + struct ggml_tensor * residual = cur; + cur = snake_1d(ctx, unit.in_alpha, cur); + if (unit.groups > 1) { + // depthwise 1d convolution is equivalent to convolution in which grouping is equal to filter size. + // If there is a divergence between filter size and grouping then the kernel's output filters will not be zero. 
+ TTS_ASSERT(unit.in_conv_kernel->ne[1] == 1); + cur = ggml_conv_1d_dw(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation); + } else { + cur = ggml_conv_1d(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation); + } + cur = ggml_add(ctx, cur, unit.in_conv_bias); + cur = snake_1d(ctx, unit.out_alpha, cur); + cur = ggml_conv_1d(ctx, unit.out_conv_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, unit.out_conv_bias); + return ggml_add(ctx, cur, residual); + } + + struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise) { + cur = snake_1d(ctx, l.in_alpha, cur); + cur = ggml_conv_transpose_1d(ctx, l.in_conv_kernel, cur, l.stride, l.padding, 1, 0, 1); + cur = ggml_add(ctx, cur, l.in_conv_bias); + if (l.noise_conv_kernel && noise) { + struct ggml_tensor * x = ggml_conv_1d(ctx, l.noise_conv_kernel, cur, 1, 0, 1); + x = ggml_mul(ctx, x, noise); + cur = ggml_add(ctx, cur, x); + } + for (int i = 0; i < l.residual_blocks.size(); i++) { + cur = build_residual_unit(ctx, cur, l.residual_blocks[i]); + } + return cur; + } + + struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l) { + cur = ggml_get_rows(ctx, l.codebook, cur); + cur = ggml_cont(ctx, ggml_transpose(ctx, cur)); + cur = ggml_conv_1d(ctx, l.out_proj_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, l.out_proj_bias); + return cur; + } +} diff --git a/src/general_neural_audio_codec.h b/src/general_neural_audio_codec.h new file mode 100644 index 0000000..1ec0a42 --- /dev/null +++ b/src/general_neural_audio_codec.h @@ -0,0 +1,67 @@ +#pragma once + +#include "tts_model.h" + +// This namespace implements a general abstraction of the core functionality used in common neural audio codecs like DAC and SNAC. 
+namespace general_neural_audio_codec { + enum gguf_tensor { + LAYER_ALPHA, + LAYER_INPUT_KERNEL, + LAYER_INPUT_BIAS, + LAYER_NOISE_KERNEL, + RESIDUAL_UNIT_INPUT_ALPHA, + RESIDUAL_UNIT_OUTPUT_ALPHA, + RESIDUAL_UNIT_INPUT_KERNEL, + RESIDUAL_UNIT_OUTPUT_KERNEL, + RESIDUAL_UNIT_INPUT_BIAS, + RESIDUAL_UNIT_OUTPUT_BIAS, + QUANTIZER_LAYER_OUT_KERNEL, + QUANTIZER_LAYER_OUT_BIAS, + QUANTIZER_LAYER_CODEBOOK + }; + + struct residual_vector_quantize_layer { + struct ggml_tensor * out_proj_kernel; + struct ggml_tensor * out_proj_bias; + struct ggml_tensor * codebook; + }; + + struct residual_unit { + residual_unit(uint32_t padding, uint32_t dilation, uint32_t groups = 1): padding(padding), dilation(dilation), groups(groups) {} + struct ggml_tensor * in_alpha; + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * out_alpha; + struct ggml_tensor * out_conv_kernel; + struct ggml_tensor * out_conv_bias; + + uint32_t padding; + uint32_t dilation; + uint32_t groups; + }; + + struct layer { + layer(uint32_t padding, uint32_t stride, uint32_t groups = 1): padding(padding), stride(stride) { + for (int i = 0; i < 3; i++) { + residual_blocks.push_back(residual_unit{(uint32_t) pow(3, (i + 1)), (uint32_t) pow(3, i), groups}); + } + } + struct ggml_tensor * in_alpha; + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * noise_conv_kernel = nullptr; + + uint32_t padding; + uint32_t stride; + + std::vector residual_blocks; + }; + + void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor); + void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor); + void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor); + + struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit); + struct 
ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise = nullptr); + struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l); +} diff --git a/src/kokoro_model.cpp b/src/kokoro_model.cpp index dad1cf5..a4b8dfc 100644 --- a/src/kokoro_model.cpp +++ b/src/kokoro_model.cpp @@ -1249,7 +1249,7 @@ void kokoro_runner::prepare_post_load() { } void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) { - random_gen(total_size * model->up_sampling_factor * (model->harmonic_num + 1), ((float*)kctx->uv_noise_data->data) + 4); + random_uniform_gen(total_size * model->up_sampling_factor * (model->harmonic_num + 1), ((float*)kctx->uv_noise_data->data) + 4); ((float*) kctx->uv_noise_data->data)[0] = model->voice_threshold; ((float*) kctx->uv_noise_data->data)[1] = model->noise_std; ((float*) kctx->uv_noise_data->data)[2] = model->sin_amp; diff --git a/src/kokoro_model.h b/src/kokoro_model.h index 328150d..1985c11 100644 --- a/src/kokoro_model.h +++ b/src/kokoro_model.h @@ -324,7 +324,6 @@ struct kokoro_duration_context : runner_context { size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; float * lens = nullptr; struct ggml_tensor * inp_tokens; @@ -405,10 +404,7 @@ struct kokoro_context : runner_context { uint32_t total_duration; uint32_t sequence_length; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * duration_pred; struct ggml_tensor * duration_mask; diff --git a/src/orpheus_model.cpp b/src/orpheus_model.cpp new file mode 100644 index 0000000..dc0fa8f --- /dev/null +++ b/src/orpheus_model.cpp @@ -0,0 +1,464 @@ +#include "orpheus_model.h" + +// These tokens and variables aren't defined in the Orpheus' model configuration but instead are defined inline in various python functions. 
+// As such, they are not discoverable so defining them as unconfigurable constants should be fine. +static constexpr std::array orpheus_voices = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"}; +static constexpr std::array orpheus_prepended_tokens = { 128259, 128000 }; +static constexpr std::array orpheus_appended_tokens = { 128009, 128260, 128261, 128257 }; + +void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) { + if (name == "norm") { + output_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(output_norm, tensor); + } else if (name == "lm_head") { + head = ggml_dup_tensor(ctx, tensor); + set_tensor(head, tensor); + } else if (name == "embed_tokens") { + embd = ggml_dup_tensor(ctx, tensor); + set_tensor(embd, tensor); + } else if (name == "rope_frequencies") { + rope_frequencies = ggml_dup_tensor(ctx, tensor); + set_tensor(rope_frequencies, tensor); + } else if (has_prefix(name, "layers")) { + auto lpair = parse_layer_count(name); + int l = lpair.first; + std::string lt_name = lpair.second; + assign_to_layer(lt_name, layers[l], tensor); + } +} + +void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) { + if (part == ".self_attn.k_proj") { + layer.k = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.k, tensor); + } else if (part == ".self_attn.q_proj") { + layer.q = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.q, tensor); + } else if (part == ".self_attn.v_proj") { + layer.v = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.v, tensor); + } else if (part == ".self_attn.o_proj") { + layer.o = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.o, tensor); + } else if (part == ".mlp.gate_proj") { + layer.gate = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.gate, tensor); + } else if (part == ".mlp.up_proj") { + layer.up = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.up, tensor); + } else if (part == ".mlp.down_proj") { + layer.down = ggml_dup_tensor(ctx, tensor); + 
set_tensor(layer.down, tensor); + } else if (part == ".input_layernorm") { + layer.input_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.input_norm, tensor); + } else if (part == ".post_attention_layernorm") { + layer.post_attention_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.post_attention_norm, tensor); + } +} + +void orpheus_model::prep_constants(gguf_context * meta) { + // get constants for orpheus + int vocab_size_key = gguf_find_key(meta, "orpheus.vocab_size"); + if (vocab_size_key != -1) { + vocab_size = gguf_get_val_u32(meta, vocab_size_key); + } + + int attn_heads_key = gguf_find_key(meta, "orpheus.attn_heads"); + if (attn_heads_key != -1) { + n_attn_heads = gguf_get_val_u32(meta, attn_heads_key); + } + + int kv_attn_heads_key = gguf_find_key(meta, "orpheus.kv_attn_heads"); + if (kv_attn_heads_key != -1) { + n_kv_attn_heads = gguf_get_val_u32(meta, kv_attn_heads_key); + } + + int head_size_key = gguf_find_key(meta, "orpheus.head_dim"); + if (head_size_key != -1) { + head_size = gguf_get_val_u32(meta, head_size_key); + } + + int stopping_token_key = gguf_find_key(meta, "orpheus.stopping_token_id"); + if (stopping_token_key != -1) { + stopping_token_id = gguf_get_val_u32(meta, stopping_token_key);; + } + + int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); + if (eos_token_id_key != -1) { + eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + } + + int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); + if (bos_token_id_key != -1) { + bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + } + + int hidden_size_key = gguf_find_key(meta, "orpheus.hidden_size"); + if (hidden_size_key != -1) { + hidden_size = gguf_get_val_u32(meta, hidden_size_key); + } + + int kv_hidden_size_key = gguf_find_key(meta, "orpheus.kv_hidden_size"); + if (kv_hidden_size_key != -1) { + kv_hidden_size = gguf_get_val_u32(meta, kv_hidden_size_key); + } +} + +void orpheus_model::prep_layers(gguf_context * meta) { + 
int n_layers_key = gguf_find_key(meta, "orpheus.layers"); + if (n_layers_key == -1) { + TTS_ABORT("the 'orpheus.layers' must be specified in the GGUF file."); + } + n_layers = (int) gguf_get_val_u32(meta, n_layers_key); + for (int i = 0; i < n_layers; i++) { + layers.push_back(orpheus_layer{}); + } +} + +struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) { + float eps = 0.00001; + return ggml_mul(ctx, ggml_rms_norm(ctx, x, eps), weight); +} + +struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) { + octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) octx->current_position + batch.n_tokens, (int64_t) octx->current_position + batch.n_tokens); + ggml_set_input(octx->attn_mask); + return octx->attn_mask; +} + + void orpheus_context::reset() { + output_tokens.clear(); + current_position = 0; + n_outputs = 0; + } + +orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) { + orpheus_context * octx = new orpheus_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + octx->backend = ggml_backend_metal_init(); +#endif + } + octx->backend_cpu = ggml_backend_cpu_init(); + octx->set_threads(); + octx->build_schedule(); + octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return octx; +} + +void orpheus_runner::orpheus_kv_cache_init() { + ggml_backend_buffer_type_t buft = nullptr; + if (octx->backend != nullptr) { +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#endif + } else { + buft = ggml_backend_cpu_buffer_type(); + } + + struct ggml_init_params params = { + /*.mem_size =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + TTS_ABORT("%s: failed to initialze ggml context 
for key value cache.\n", __func__); + } + if (!kv_self) { + kv_self = new orpheus_kv_cache; + } + kv_self->ctx = ctx; + kv_self->k_l.reserve(model->layers.size()); + kv_self->v_l.reserve(model->layers.size()); + + for (int i = 0; i < (int) model->layers.size(); i++) { + ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size)); + ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size)); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + kv_self->k_l.push_back(k); + kv_self->v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft); + ggml_backend_buffer_clear(buf, 0); + kv_self->buf = buf; + } + + void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) { + k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies, + model->head_size, 2,0, 500000.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave, + // and performing the repeat operation upfront before performing a single copy needs to be performed in order to better optimize this function. + // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us + // from incrementally larger transpositions with generation.
+ for (int i = 0; i < repeat; i++) { + struct ggml_tensor * k_cache_view = ggml_view_3d( + ctx, + kv_self->k_l[index], + model->head_size, + model->n_kv_attn_heads, + n_tokens, + ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size + ); + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + + struct ggml_tensor * v_cache_view = ggml_view_3d( + ctx, + kv_self->v_l[index], + model->head_size, + model->n_kv_attn_heads, + n_tokens, + ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size + ); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); + } +} + +struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens; + octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(octx->positions); + octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(octx->inp_tokens); + inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens); + + struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch); + + for (int l = 0; l < model->n_layers; l++) { + struct ggml_tensor * residual = inpL; + cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm); + + struct ggml_tensor * 
attn_out; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur); + + orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3); + struct ggml_tensor * k = + ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l], + model->head_size, full_sequence_length, model->n_attn_heads, + ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[l]) * model->head_size, + 0)); + + struct ggml_tensor * v = + ggml_view_2d(ctx, kv_self->v_l[l], + model->hidden_size, full_sequence_length, + ggml_element_size(kv_self->k_l[l]) * model->hidden_size, + 0); + + v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads); + + Qcur = ggml_rope_ext( + ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)), + octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3)); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); + attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens); + attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out); + } + + cur = ggml_add(ctx, attn_out, residual); + + struct ggml_tensor * residualffn = cur; + + // mlp + { + cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm); + cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur)); + cur = ggml_mul_mat(ctx, 
model->layers[l].down, cur); + } + cur = ggml_add(ctx, cur, residualffn); + inpL = cur; + } + + cur = orpheus_build_layer_norm(ctx, cur, model->output_norm); + // only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented. + cur = ggml_mul_mat(ctx, model->head, cur); + if (batch.n_tokens > 1) { + cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size)); + } + ggml_build_forward_expand(gf, cur); + free_build(); + + return gf; +} + +void orpheus_runner::decode(orpheus_ubatch & batch) { + ggml_backend_sched_reset(octx->sched); + + octx->output_tokens.reserve(model->max_generation_size); + + const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float); + octx->prep_output_buffer(new_size); + + ggml_cgraph * gf = build_orpheus_graph(batch); + + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(octx->sched, gf); + + set_inputs(batch); + ggml_backend_sched_graph_compute_async(octx->sched, gf); + + float * logits_out = octx->logits + octx->n_outputs * model->vocab_size; + octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float)); + + // update the total number of outputs retrieved and the current position + octx->current_position += batch.n_tokens; + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation.
+ ggml_backend_sched_reset(octx->sched); +} + +void orpheus_runner::set_inputs(orpheus_ubatch & batch) { + ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens)); + int32_t * pos = (int32_t*) octx->positions->data; + float * mask = (float*) octx->attn_mask->data; + uint32_t max_pos = octx->current_position + batch.n_tokens; + for (int i = 0; i < batch.n_tokens; i++) { + pos[i] = (int32_t) octx->current_position + i; + for (int ii = 0; ii < max_pos; ii++) { + mask[i*max_pos + ii] = ii > pos[i] ? -INFINITY : 0.0f; + } + } +} + +orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) { + struct orpheus_ubatch batch; + for (auto t : orpheus_prepended_tokens) { + batch.tokens.push_back(t); + } + if (!octx->voice.empty()) { + sentence = octx->voice + ": " + sentence; + } + tokenizer->tokenize(sentence, batch.tokens); + for (auto t : orpheus_appended_tokens) { + batch.tokens.push_back(t); + } + batch.n_tokens = batch.tokens.size(); + return batch; +} + +std::vector> orpheus_runner::prepare_output_tokens() { + size_t chunks = octx->output_tokens.size() / 7; + std::vector> output_tokens; + for (int i = 0; i < model->audio_heads; i++) { + output_tokens.push_back(std::vector{}); + } + for (int i = 0; i < chunks; i++) { + for (int ii = 0; ii < 7; ii++) { + uint32_t thead = model->heads[ii]; + // the manipulations below are not configured because they are performed inline via undocumented constants in the Orpheus codebase. + // Essentially this is how Orpheus converts discrete samples from the output shape to the audio input shape. 
+ uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096); + output_tokens[thead].push_back(t); + } + } + return output_tokens; +} + +void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) { + while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) { + decode(batch); + generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens); + // only increment the output count after sampling + octx->n_outputs++; + batch = orpheus_ubatch{ + 1, {octx->output_tokens.back()} + }; + } + // this case could be better addressed by adding splitting to the generation process. + if (octx->output_tokens.size() >= model->max_generation_size) { + fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n"); + } + std::vector> processed_output_tokens = prepare_output_tokens(); + srunner->run(processed_output_tokens, output); +} + +int orpheus_runner::generate(std::string sentence, struct tts_response * response) { + orpheus_ubatch batch = batch_from_sentence(sentence); + // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will + // surpass the default size. + if (batch.tokens.size() > model->max_context_length) { + TTS_ABORT("The prompt was too large for the default context window.
Try splitting up or shortenning the prompt."); + } + octx->reset(); + generation_sampler->reset(); + if (!kv_self) { + orpheus_kv_cache_init(); + } + generate_from_batch(batch, response); + return 0; +} + +void orpheus_runner::configure_generation(generation_configuration * config) { + generation_sampler->temperature = config->temperature; + generation_sampler->repetition_penalty = config->repetition_penalty; + generation_sampler->do_sample = config->sample; + generation_sampler->top_k = config->top_k; + generation_sampler->top_p = config->top_p; + if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) { + TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str()); + } + octx->voice = config->voice; +} + +orpheus_ubatch orpheus_runner::build_worst_case_batch() { + orpheus_ubatch batch; + batch.n_tokens = model->max_context_length; + return batch; +} + +void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) { + if (tensor->data == NULL) { + return; + } + + if (name.size() == 0) { + // handles the top level meta tensor + return; + } + + if (name.size() > 5 && name.substr(0, 5) == "snac.") { + srunner->model->assign_weight(name.substr(5), tensor); + } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") { + model->assign_weight(name.substr(8), tensor); + } else { + fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str()); + } +} + +void orpheus_runner::prepare_post_load() { + srunner->prepare_post_load(); + orpheus_kv_cache_init(); + auto batch = build_worst_case_batch(); + auto gf = build_orpheus_graph(batch); + octx->prep_schedule(gf); +} diff --git a/src/orpheus_model.h b/src/orpheus_model.h new file mode 100644 index 0000000..6edd36b --- /dev/null +++ b/src/orpheus_model.h @@ -0,0 +1,145 @@ +#pragma once + +#include "sampler.h" +#include "tokenizer.h" +#include "snac_model.h" + +// Orpheus 
uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads. + +struct orpheus_layer { + struct ggml_tensor * input_norm; + struct ggml_tensor * post_attention_norm; + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + struct ggml_tensor * o; + struct ggml_tensor * gate; + struct ggml_tensor * up; + struct ggml_tensor * down; +}; + +struct orpheus_model : tts_model { + uint32_t vocab_size = 156940; + uint32_t n_attn_heads = 24; + uint32_t n_kv_attn_heads = 8; + uint32_t head_size = 128; + uint32_t max_context_length = 1024; + // the generation size is technically arbitrary as the model can handle a large context. This size comes out to being 25.6 seconds. + uint32_t max_generation_size = 2100; + uint32_t stopping_token_id = 128258; + uint32_t eos_token_id = 128001; + uint32_t bos_token_id = 128000; + uint32_t hidden_size = 3072; + uint32_t kv_hidden_size = 1024; + uint32_t audio_heads = 3; + uint32_t heads[7] = {0, 1, 2, 2, 1, 2, 2}; + + int n_layers = 28; + + struct std::vector layers; + struct ggml_tensor * head; + struct ggml_tensor * embd; + struct ggml_tensor * output_norm; + struct ggml_tensor * rope_frequencies; + + void assign_weight(std::string name, ggml_tensor * tensor); + void assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_constants(meta_ctx); + prep_layers(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "orpheus", 1.30); + } +}; + +struct orpheus_context : runner_context { + orpheus_context(orpheus_model * model, int n_threads): runner_context(n_threads), model(model) {}; + struct orpheus_model * model; + + uint32_t current_position = 0; // current position in the active sequence + uint32_t n_outputs = 0; // the 
position of the text prompt termination (used for adjusting the cache when incrementally generating) + std::string voice; + + std::vector output_tokens; + + void reset(); + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * attn_mask; + struct ggml_tensor * positions; +}; + +struct orpheus_kv_cache { + ggml_type cache_type = GGML_TYPE_F32; + + std::vector k_l; + std::vector v_l; + + struct ggml_context * ctx; + ggml_backend_buffer_type_t buft; + ggml_backend_buffer_t buf; + + void free() { + ggml_free(ctx); + ggml_backend_buffer_free(buf); + } + + ~orpheus_kv_cache() { + free(); + } +}; + +struct orpheus_context * build_new_orpheus_context(struct orpheus_model * model, int n_threads, bool use_cpu = true); + +struct orpheus_ubatch { + orpheus_ubatch() = default; + orpheus_ubatch(size_t n_tokens, std::vector tokens): n_tokens(n_tokens), tokens(tokens) {}; + size_t n_tokens; // total sentence tokens + std::vector tokens; // [n_tokens] +}; + +struct orpheus_runner : tts_runner { + orpheus_runner( + orpheus_model * model, + snac_runner * audio_decoder, + orpheus_context * octx, + bpe_tokenizer * bt, + sampler * samp, + orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { + tts_runner::sampling_rate = 24000.0f; + generation_sampler->n_output_heads = 1; + generation_sampler->vocab_size = model->vocab_size; + generation_sampler->eos_token_id = model->eos_token_id; + } + orpheus_model * model; + snac_runner * srunner; + orpheus_context * octx; + bpe_tokenizer * tokenizer; + orpheus_kv_cache * kv_self; + sampler * generation_sampler; + + void init_build() { + tts_runner::init_build(&octx->buf_compute_meta); + } + + struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch); + void orpheus_kv_cache_init(); + void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct 
ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat); + void configure_generation(generation_configuration * config); + void assign_weight(std::string name, ggml_tensor * tensor); + std::vector> prepare_output_tokens(); + orpheus_ubatch build_worst_case_batch(); + orpheus_ubatch batch_from_sentence(std::string sentence); + void set_inputs(orpheus_ubatch & batch); + void decode(orpheus_ubatch & batch); + void prepare_post_load(); + int generate(std::string sentence, struct tts_response * response); + void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output); +}; + +static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight); +static struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch); diff --git a/src/parler_model.h b/src/parler_model.h index b200999..463910f 100644 --- a/src/parler_model.h +++ b/src/parler_model.h @@ -115,9 +115,6 @@ struct parler_context : runner_context { int32_t seq_id; // a unique identifier associated with the active sequence. 
std::vector output_tokens; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; struct ggml_tensor * inp_tokens; struct ggml_tensor * audio_inp_tokens; @@ -207,7 +204,6 @@ struct parler_tts_runner : tts_runner { tts_runner::init_build(&pctx->buf_compute_meta); } - void configure_generation(generation_configuration * config); void assign_weight(std::string name, ggml_tensor * tensor); parler_ubatch build_worst_case_batch(); diff --git a/src/snac_model.cpp b/src/snac_model.cpp new file mode 100644 index 0000000..cfe38b3 --- /dev/null +++ b/src/snac_model.cpp @@ -0,0 +1,209 @@ +#include "snac_model.h" + +void snac_model::prep_constants(gguf_context * meta) { + int heads_key = gguf_find_key(meta, "snac.audio_token_channels"); + if (heads_key != -1) { + n_heads = gguf_get_val_u32(meta, heads_key); + } + + int sampling_factor_key = gguf_find_key(meta, "snac.up_sampling_factor"); + if (sampling_factor_key != -1) { + up_sampling_factor = gguf_get_val_u32(meta, sampling_factor_key); + } + + int max_gen_key = gguf_find_key(meta, "snac.max_generation_size"); + if (max_gen_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_gen_key); + } +} + +void snac_model::prep_layers(gguf_context * meta) { + for (int i = 0; i < n_heads; i++) { + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); + } + + for (int i = 0; i < n_layers; i++) { + std::string stride_key = "snac.snac_layer_stride_" + std::to_string(i); + std::string padding_key = "snac.snac_layer_padding_" + std::to_string(i); + std::string grouping_key = "snac.snac_layer_grouping_" + std::to_string(i); + int layer_stride_key = gguf_find_key(meta, stride_key.c_str()); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = gguf_find_key(meta, padding_key.c_str()); + if (layer_padding_key == -1) { + TTS_ABORT("key %s 
must be specified in gguf file inorder to initialize the SNAC audio decoder.", padding_key.c_str()); + } + int layer_grouping_key = gguf_find_key(meta, grouping_key.c_str()); + if (layer_grouping_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", grouping_key.c_str()); + } + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + gguf_get_val_u32(meta, layer_grouping_key) + } + ); + } +} + +void snac_model::assign_weight(std::string name, ggml_tensor * tensor) { + if (name == "alpha_out") { + snake_alpha = ggml_dup_tensor(ctx, tensor); + set_tensor(snake_alpha, tensor); + } else if (name == "in.weight") { + in_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(in_conv_kernel, tensor); + } else if (name == "in.bias") { + in_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(in_conv_bias, tensor); + } else if (name == "up.weight") { + up_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(up_conv_kernel, tensor); + } else if (name == "up.bias") { + up_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(up_conv_bias, tensor); + } else if (name == "final.weight") { + out_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(out_conv_kernel, tensor); + } else if (name == "final.bias") { + out_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(out_conv_bias, tensor); + } else if (has_prefix(name, "layers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_layer((tts_model *) this, layers[l], lt_name, tensor); + } else if (has_prefix(name, "quantizers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) this, quantizer_layers[l], 
lt_name, tensor); + } +} + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers) { + struct ggml_tensor * embd; + // these divisors represent the discrete repeats performed against each of the three input heads. + sctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sequence_length / 4 + sequence_length / 2 + sequence_length); + ggml_set_input(sctx->inp_tokens); + size_t last_stride = 0; + for(int i = 0; i < sctx->model->n_heads; i++) { + auto quantize_layer = sctx->model->quantizer_layers[i]; + struct ggml_tensor * inp_head = ggml_cont(ctx, ggml_view_1d(ctx, sctx->inp_tokens, sequence_length / sctx->model->repeats[i], last_stride)); + last_stride += (sequence_length / sctx->model->repeats[i]) * ggml_element_size(sctx->inp_tokens); + struct ggml_tensor * code = general_neural_audio_codec::build_quantize_layer(ctx, inp_head, quantize_layer); + if (sctx->model->repeats[i] > 1) { + // this manipulation is equivalent to repeat_interleave against the first dimension of the tensor + code = ggml_repeat(ctx, ggml_cont_3d(ctx, code, 1, code->ne[0], code->ne[1]), ggml_new_tensor_3d(ctx, GGML_TYPE_F32, sctx->model->repeats[i], code->ne[0], sctx->model->embd)); + code = ggml_cont_2d(ctx, code, sequence_length, code->ne[2]); + } + if (i == 0) { + embd = code; + } else { + embd = ggml_add(ctx, embd, code); + } + } + return embd; +} + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu) { + snac_context * sctx = new snac_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + sctx->backend = ggml_backend_metal_init(); +#endif + } + sctx->backend_cpu = ggml_backend_cpu_init(); + sctx->set_threads(); + sctx->build_schedule(); + sctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return sctx; +} + +void snac_runner::prepare_post_load() { + 
ggml_cgraph * gf = build_snac_graph(model->max_generation_size); + sctx->prep_schedule(gf); +} + +struct ggml_cgraph * snac_runner::build_snac_graph(size_t sequence_length) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inputs; + + sctx->noise = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model->noise_steps_sum * sequence_length); + ggml_set_input(sctx->noise); + + inputs = snac_build_audio_inputs(ctx, sctx, sequence_length, model->quantizer_layers); + cur = ggml_conv_1d_dw(ctx, model->in_conv_kernel, inputs, 1, 3, 1); + cur = ggml_add(ctx, cur, model->in_conv_bias); + cur = ggml_conv_1d(ctx, model->up_conv_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, model->up_conv_bias); + size_t noise_offset = 0; + for (int l = 0; l < model->layers.size(); l++) { + auto layer = model->layers[l]; + struct ggml_tensor * noise = ggml_cont(ctx, ggml_view_1d(ctx, sctx->noise, model->noise_steps[l] * sequence_length, noise_offset)); + noise_offset += model->noise_steps[l] * sequence_length * sizeof(float); + cur = general_neural_audio_codec::build_layer(ctx, cur, layer, noise); + } + cur = snake_1d(ctx, model->snake_alpha, cur); + cur = ggml_conv_1d(ctx, model->out_conv_kernel, cur, 1, 3, 1); + cur = ggml_add(ctx, cur, model->out_conv_bias); + cur = ggml_tanh(ctx, cur); + ggml_build_forward_expand(gf, cur); + free_build(); + return gf; +} + +void snac_runner::set_inputs(std::vector> & tokens) { + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[0].data(), 0, + tokens[0].size()*ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[1].data(), tokens[0].size() * ggml_element_size(sctx->inp_tokens), + tokens[1].size() * ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[2].data(), + tokens[1].size()*ggml_element_size(sctx->inp_tokens)+tokens[0].size()*ggml_element_size(sctx->inp_tokens), + 
tokens[2].size()*ggml_element_size(sctx->inp_tokens) + ); + size_t sequence_length = tokens[2].size(); + random_normal_gen(model->noise_steps_sum * sequence_length, (float*) sctx->noise->data); +} + +void snac_runner::run(std::vector> & tokens, struct tts_response * outputs) { + size_t sequence_length = tokens[2].size(); + ggml_backend_sched_reset(sctx->sched); + + sctx->prep_output_buffer(model->max_generation_size * model->up_sampling_factor * sizeof(float)); + + outputs->data = sctx->logits; + ggml_backend_buffer_clear(sctx->buf_output, 0); + + struct ggml_cgraph * gf = NULL; + gf = build_snac_graph(sequence_length); + + // the output is always the last tensor in the graph + struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(sctx->sched, gf); + + set_inputs(tokens); + + ggml_backend_sched_graph_compute_async(sctx->sched, gf); + + sctx->get_ggml_node_data(result, outputs->data, sequence_length*sizeof(float)*model->up_sampling_factor); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sctx->sched); + outputs->n_outputs = sequence_length * model->up_sampling_factor; + return; +} + diff --git a/src/snac_model.h b/src/snac_model.h new file mode 100644 index 0000000..9450c1b --- /dev/null +++ b/src/snac_model.h @@ -0,0 +1,86 @@ +#pragma once + +#include "general_neural_audio_codec.h" + +// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC. +// The key differences are that it uses grouping in the residual units of its layers, +// performs a repeat_interleave over the second and third input channels, applies +// a noise convolutional layer after input encoding for each layer, and applies +// an extra convolutional layer before residual layers are applied. 
+struct snac_model : tts_model { + // general configuration from SNAC as used by Orpheus + uint32_t n_layers = 4; + uint32_t n_heads = 3; + uint32_t up_sampling_factor = 512; + uint32_t embd = 768; + size_t max_generation_size = 2580; + uint32_t repeats[3] = {4, 2, 1}; + // configuration for adding noise + uint32_t noise_steps[4] = {8, 64, 256, 512}; + uint32_t noise_steps_sum = 840; + bool use_noise = true; + + struct ggml_tensor * repeat_interleave_buffer; + + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * up_conv_kernel; + struct ggml_tensor * up_conv_bias; + struct ggml_tensor * out_conv_kernel; + struct ggml_tensor * out_conv_bias; + struct ggml_tensor * snake_alpha; + std::vector layers; + std::vector quantizer_layers; + + void assign_weight(std::string name, ggml_tensor * weight); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void post_load_assign(); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_layers(meta_ctx); + prep_constants(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "snac"); + } +}; + +// the context used for running the snac model +struct snac_context : runner_context { + snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {}; + + struct snac_model * model; + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * noise; + + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } +}; + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu = true); + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers); + +// This struct is intended to manage the snac model's graph compilation and compute function. 
+struct snac_runner : tts_runner { + snac_runner(snac_model * model, snac_context * context): model(model), sctx(context) {}; + ~snac_runner() { + if (ctx) { + ggml_free(ctx); + } + model->free(); + delete model; + delete sctx; + } + snac_model * model; + snac_context * sctx; + + void init_build() { + tts_runner::init_build(&sctx->buf_compute_meta); + } + + void set_inputs(std::vector> & tokens); + void prepare_post_load(); + struct ggml_cgraph * build_snac_graph(size_t sequence_length); + void run(std::vector> & tokens, struct tts_response * outputs); +}; diff --git a/src/t5_encoder_model.h b/src/t5_encoder_model.h index c155ec9..9a80187 100644 --- a/src/t5_encoder_model.h +++ b/src/t5_encoder_model.h @@ -78,9 +78,6 @@ struct t5_context : runner_context { struct t5_encoder * model; - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - struct ggml_tensor * inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * attn_mask; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index b07ebd2..5663613 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -206,3 +206,126 @@ struct single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * me return new single_pass_tokenizer(tokens); } +void bpe_symbol::add_merges(std::priority_queue, bpe_merge_comp> & merges, std::unordered_map, int, pair_hash> & rank_map, bool only_forward) { + if (!only_forward && last) { + auto rid = std::make_pair(last->as_str(), as_str()); + if (rank_map.find(rid) != rank_map.end()) { + bpe_merge m{last, this, rank_map[rid], last->size + size}; + merges.push(m); + } + } + + if (next) { + auto rid = std::make_pair(as_str(), next->as_str()); + if (rank_map.find(rid) != rank_map.end()) { + bpe_merge m{this, next, rank_map[rid], size + next->size}; + merges.push(m); + } + } +} + +std::string bpe_symbol::as_str() { + return std::string(token, size); +} + +bool bpe_merge_comp::operator() (const bpe_merge & a, const bpe_merge & b) { + return 
a.rank > b.rank || (a.rank == b.rank && a.a && b.a && a.a->pos > b.a->pos); +} + +size_t pair_hash::operator() (const std::pair & p) const { + return std::hash{}(p.first) ^ (std::hash{}(p.second) << 1); +} + +bpe_symbol * bpe_merge::merge() { + a->size += b->size; + b->size = -1; + a->next = b->next; + if (a->next) { + a->next->last = a; + } + return a; +} + +void pair_builder::join_pairs(std::unordered_map, int, pair_hash> & rank_map) { + std::priority_queue, bpe_merge_comp> merges; + for (auto part : parts) { + part->add_merges(merges, rank_map, true); + } + while (!merges.empty()) { + auto m = merges.top(); + merges.pop(); + if (m.a->size > 0 && m.b->size > 0 && m.new_size == m.a->size + m.b->size) { + m.merge(); + m.a->add_merges(merges, rank_map); + } + + } +} + +void bpe_tokenizer::tokenize(const std::string & text, std::vector & token_ids) { + std::vector chunks = split(text, " ", true); + bool space_prior = false; + for (auto chunk : chunks) { + if (chunk != " ") { + bpe_tokenize(space_prior ? 
"Ġ" + chunk : chunk, token_ids); + } else { + space_prior = true; + } + } +} + +void bpe_tokenizer::bpe_tokenize(std::string chunk, std::vector & token_ids) { + if (tokens_to_ids.find(chunk) != tokens_to_ids.end()) { + token_ids.push_back(tokens_to_ids[chunk]); + return; + } + auto pb = pair_builder{chunk}; + pb.join_pairs(ranks); + bpe_symbol * next = pb.parts[0]; + while (next) { + token_ids.push_back(tokens_to_ids[next->as_str()]); + next = next->next; + } +} + +bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name) { + int vocab_key = gguf_find_key(meta, (base_name + ".tokens").c_str()); + if (vocab_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".tokens").c_str()); + } + int merges_key = gguf_find_key(meta, (base_name + ".merges").c_str()); + if (merges_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".merges").c_str()); + } + int eos_token_id_key = gguf_find_key(meta, (base_name + ".eos_token_id").c_str()); + if (eos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".eos_token_id").c_str()); + } + int bos_token_id_key = gguf_find_key(meta, (base_name + ".bos_token_id").c_str()); + if (bos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".bos_token_id").c_str()); + } + + uint32_t bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + uint32_t eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + + std::unordered_map vocab; + int token_count = gguf_get_arr_n(meta, vocab_key); + for (int i = 0; i < token_count; i++) { + vocab[gguf_get_arr_str(meta, vocab_key, i)] = (uint32_t) i; + } + + std::unordered_map, int, pair_hash> ranks; + int merge_count = gguf_get_arr_n(meta, merges_key); + + for (int i = 0; i < merge_count; i++) { + auto raw_merge = gguf_get_arr_str(meta, merges_key, i); + 
std::vector pair = split(raw_merge, " "); + if (pair.size() != 2) { + TTS_ABORT("Invalid pair, '%s', found in BPE merges, '%s', at index %d.", raw_merge, (base_name + ".merges").c_str(), i); + } + ranks[std::make_pair<>(pair[0], pair[1])] = i; + } + + return new bpe_tokenizer(vocab, ranks, bos_token_id, eos_token_id); +} diff --git a/src/tokenizer.h b/src/tokenizer.h index 964d6f9..6216340 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "util.h" struct token_trie { @@ -74,4 +75,80 @@ struct single_pass_tokenizer { single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name = "phonemizer.graphemes"); +struct bpe_symbol; + +struct bpe_merge { + bpe_symbol * a; + bpe_symbol * b; + int rank; + int new_size; + + bpe_symbol * merge(); +}; + +struct bpe_merge_comp{ + bool operator() (const bpe_merge & a, const bpe_merge & b); +}; + +struct pair_hash { + size_t operator() (const std::pair & p) const; +}; + +struct bpe_symbol { + bpe_symbol(const char * token): token(token) {}; + const char* token; + int size = 1; + int pos; + bpe_symbol * next = nullptr; + bpe_symbol * last = nullptr; + + void add_merges(std::priority_queue, bpe_merge_comp> & merges, std::unordered_map, int, pair_hash> & rank_map, bool only_forward = false); + std::string as_str(); +}; + +struct pair_builder { + pair_builder(std::string word) { + bpe_symbol * last = nullptr; + for (int i = 0; i < word.size(); i++) { + int increment = 0; + // make sure we process each utf-8 character. 
+ while(i + increment + 1 < word.size() && (word[i+increment+1] & 0b11000000) == 0b10000000) { + ++increment; + } + bpe_symbol * part = new bpe_symbol(word.data()+i); + part->pos = i; + part->size += increment; + i += increment; + if (last) { + last->next = part; + part->last = last; + } + last = part; + parts.push_back(part); + } + } + + ~pair_builder() { + for (auto p : parts) { + delete p; + } + } + + void join_pairs(std::unordered_map, int, pair_hash> & rank_map); + std::vector parts; +}; + +struct bpe_tokenizer { + bpe_tokenizer(std::unordered_map & tokens_to_ids, std::unordered_map, int, pair_hash> & ranks, uint32_t bos, uint32_t eos): tokens_to_ids(tokens_to_ids), ranks(ranks), eos_token_id(eos), bos_token_id(bos) {}; + std::unordered_map tokens_to_ids; + std::unordered_map, int, pair_hash> ranks; + uint32_t eos_token_id; + uint32_t bos_token_id; + + void tokenize(const std::string & text, std::vector & token_ids); + void bpe_tokenize(std::string chunk, std::vector & token_ids); +}; + +bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name = "tokenizer.ggml"); + #endif diff --git a/src/tts.cpp b/src/tts.cpp index d426dae..348144e 100644 --- a/src/tts.cpp +++ b/src/tts.cpp @@ -10,6 +10,32 @@ static constexpr std::array DURATION_PREDICTOR_QUANTIZATION_COM "layers" }; +struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { + orpheus_model * model = new orpheus_model; + snac_model * audio_model = new snac_model; + bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx); + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + sampler * samp = new sampler; + snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only); + snac_runner * audio_decoder = new snac_runner(audio_model, sctx); + orpheus_context * octx = 
build_new_orpheus_context(model, n_threads, cpu_only); + orpheus_kv_cache * cache = new orpheus_kv_cache; + orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache); + + for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { + runner->assign_weight(cur->name, cur); + } + + runner->prepare_post_load(); + + gguf_free(meta_ctx); + ggml_free(weight_ctx); + runner->arch = arch; + + return (tts_runner*)runner; +} + struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { parler_tts_model * model = new parler_tts_model; dac_model * audio_model = new dac_model; @@ -125,6 +151,8 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); case DIA_ARCH: return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); + case ORPHEUS_ARCH: + return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); default: TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str()); } @@ -140,6 +168,9 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re case DIA_ARCH: ((dia_runner*)runner)->configure_generation(config); return ((dia_runner*)runner)->generate(sentence, response); + case ORPHEUS_ARCH: + ((orpheus_runner*)runner)->configure_generation(config); + return ((orpheus_runner*)runner)->generate(sentence, response); default: TTS_ABORT("%s failed. 
The architecture '%d' is not supported.", __func__, runner->arch); } diff --git a/src/tts_model.cpp b/src/tts_model.cpp index b6cad74..8fb8412 100644 --- a/src/tts_model.cpp +++ b/src/tts_model.cpp @@ -67,6 +67,19 @@ bool runner_context::prep_schedule(struct ggml_cgraph * gf) { return ggml_backend_sched_reserve(sched, gf); } +void runner_context::prep_output_buffer(size_t new_size) { + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output) : 0; + if (!buf_output || prev_size < new_size) { + if (buf_output) { + ggml_backend_buffer_free(buf_output); + buf_output = nullptr; + logits = nullptr; + } + buf_output = ggml_backend_buft_alloc_buffer(backend_cpu_buffer, new_size); + } + logits = (float *) ggml_backend_buffer_get_base(buf_output); +} + void tts_runner::init_build(std::vector* buf_compute_meta) { struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta->size(), diff --git a/src/tts_model.h b/src/tts_model.h index 6eb59de..93d0a21 100644 --- a/src/tts_model.h +++ b/src/tts_model.h @@ -11,7 +11,7 @@ void append_to_response(struct tts_response * response, struct tts_response * to using tensor_meta_callback = std::function*; struct runner_context { - runner_context(int n_threads): n_threads(n_threads) {}; + runner_context(int n_threads): n_threads(n_threads) {}; virtual ~runner_context() { ggml_backend_sched_free(sched); ggml_threadpool_free(threadpool); @@ -30,16 +30,18 @@ struct runner_context { ggml_backend_buffer_t buf_output = nullptr; ggml_backend_sched_t sched = nullptr; ggml_threadpool_t threadpool = nullptr; + float * logits = nullptr; int n_threads; void get_ggml_node_data(struct ggml_tensor * output_tensor, float * output, size_t output_size, ggml_backend_buffer_t buffer = nullptr); void set_threads(); void build_schedule(size_t max_nodes); bool prep_schedule(ggml_cgraph * gf); + void prep_output_buffer(size_t new_size); }; struct tts_model { - struct model_tensor_meta tensor_meta; + struct model_tensor_meta 
tensor_meta; // this is the current byte offset into the model's buffer. size_t offset = 0; @@ -56,7 +58,7 @@ struct tts_model { struct ggml_context * ctx; - void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size); + void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size); void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset = 1.4, uint32_t dedicated_add_on_size = 0); void set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target); size_t max_nodes(); diff --git a/src/util.cpp b/src/util.cpp index a5bbb4b..9068c70 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -63,7 +63,7 @@ int search_for_gguf_keys(gguf_context * meta, std::vector possible_ return gguf_key; } -void random_gen(int count, float * tgt, float min, float max) { +void random_uniform_gen(int count, float * tgt, float min, float max) { static std::default_random_engine e; static std::uniform_real_distribution dis(min, max); for (int i = 0; i < count; i++) { @@ -71,6 +71,14 @@ void random_gen(int count, float * tgt, float min, float max) { } } +void random_normal_gen(int count, float * tgt, float mean, float std) { + static std::default_random_engine e; + static std::normal_distribution dis(mean, std); + for (int i = 0; i < count; i++) { + tgt[i] = dis(e); + } +} + float round_to_float(double v) { return roundf(v * powl(10, 6)) / powl(10, 6); } @@ -220,6 +228,11 @@ std::vector split(std::string target, std::string split_on, bool in output.push_back(target.substr(i, 1)); } last = i+1; + } else if (i == last && split_on.find(target[i]) != std::string::npos) { + if (include_split_characters) { + output.push_back(target.substr(i, 1)); + } + last = i+1; } } if (last < target.size()) { @@ -242,6 +255,11 @@ std::vector split(std::string target, const char split_on, bool inc output.push_back(target.substr(i, 1)); } last = i+1; + } else if (i == 
last && split_on == target[i]) { + if (include_split_characters) { + output.push_back(target.substr(i, 1)); + } + last = i+1; } } if (last < target.size()) { diff --git a/src/util.h b/src/util.h index 458d080..681111e 100644 --- a/src/util.h +++ b/src/util.h @@ -26,7 +26,12 @@ struct model_tensor_meta { size_t n_bytes = 0; }; -void random_gen(int count, float * tgt, float min = 0.0f, float max = 1.0); +/** + * Both of these random fill the tgt array with count random floating point values. + * the default parameter values are consistent with pytorch random function defaults. + */ +void random_uniform_gen(int count, float * tgt, float min = 0.0f, float max = 1.0f); +void random_normal_gen(int count, float * tgt, float mean = 0.0f, float std = 1.0f); std::pair parse_layer_count(std::string name, int skip = 0);