diff --git a/README.md b/README.md index e32bc72..e88242c 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ In this endeavor, MacOS and metal support will be treated as the primary platfor | [Parler TTS Large](https://huggingface.co/parler-tts/parler-tts-large-v1)|✓|✓|✓|[here](https://huggingface.co/mmwillet2/Parler_TTS_GGUF)| | [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) |✓|✗|✓|[here](https://huggingface.co/mmwillet2/Kokoro_GGUF) | | [Dia](https://github.com/nari-labs/dia) |✓|✓|✓|[here](https://huggingface.co/mmwillet2/Dia_GGUF) | +| [Orpheus](https://github.com/canopyai/Orpheus-TTS) |✓|✗|✗|[here](https://huggingface.co/mmwillet2/Orpheus_GGUF) | Additional Model support will initially be added based on open source model performance in both the [old TTS model arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena) and [new TTS model arena](https://huggingface.co/spaces/TTS-AGI/TTS-Arena-V2) as well as the availability of said models' architectures and checkpoints. diff --git a/examples/cli/README.md b/examples/cli/README.md index 0fe687f..549c41e 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -11,7 +11,7 @@ This simple example cli tool can be used to generate speach from a text prompt a In order to get a detailed breakdown the functionality currently available you can call the cli with the `--help` parameter. This will return a breakdown of all parameters: ```bash -./cli --help +./tts-cli --help --temperature (-t): The temperature to use when generating outputs. Defaults to 1.0. @@ -52,25 +52,44 @@ In order to get a detailed breakdown the functionality currently available you c General usage should follow from these possible parameters. E.G. The following command will save generated speech to the `/tmp/test.wav` file. 
```bash -./cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav +./tts-cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav ``` -#### Dia Generation Arguments +#### Dia and Orpheus Generation Arguments -Currently the default cli arguments are not aligned with Dia's default sampling settings. Specifically the temperature and topk settings should be changed to `1.3` and `35` respectively when generating with Dia like so: +Currently the default cli arguments are not aligned with Dia's or Orpheus' default sampling settings. Specifically the temperature and topk settings should be changed to `1.3` and `35` respectively when generating with Dia like so: -```base -./cli --model-path /model/path/to/Dia.gguf --prompt "[S1] Hi, I am Dia, this is how I talk." --save-path /tmp/test.wav --topk 35 --temperature 1.3 +```bash +./tts-cli --model-path /model/path/to/Dia.gguf --prompt "[S1] Hi, I am Dia, this is how I talk." --save-path /tmp/test.wav --topk 35 --temperature 1.3 ``` +and the voice, temperature, and repetition penalty setting should be changed to a valid voice (e.g. `leah`), `0.7`, and `1.1` respectively when generating with Orpheus like so: + +```bash +./tts-cli --model-path /model/path/to/Orpheus.gguf --prompt "Hi, I am Orpheus, this is how I talk." --save-path /tmp/test.wav --voice leah --temperature 0.7 --repetition-penalty 1.1 +``` + + #### Conditional Generation +Conditional generation is a Parler TTS specific behavior. + By default the Parler TTS model is saved to the GGUF format with a pre-encoded conditional prompt (i.e. 
a prompt used to determine how to generate speech), but if the text encoder model, the T5-Encoder model, is avaiable in gguf format (see the [python convertion scripts](../../py-gguf/README.md) for more information on how to prepare the T5-Encoder model) then a new conditional prompt can be used for generation like so: ```bash -./cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav --text-encoder-path /model/path/to/t5_encoder_file.gguf --consditional-prompt "deep voice" +./tts-cli --model-path /model/path/to/gguf_file.gguf --prompt "I am saying some words" --save-path /tmp/test.wav --text-encoder-path /model/path/to/t5_encoder_file.gguf --consditional-prompt "deep voice" ``` +#### Distinct Voice Support + +Kokoro and Orpheus both support voices which can be set via the `--voice` (`-v`) argument. Orpheus supports the following voices: + +``` +"zoe", "zac","jess", "leo", "mia", "julia", "leah" +``` + +and Kokoro supports the voices listed in the section below. + #### MultiLanguage Configuration Kokoro supports multiple langauges with distinct voices, and, by default, the standard voices are encoded in the Kokoro gguf file.
Below is a list of the available voices: diff --git a/ggml b/ggml index 1e85c87..136da02 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 1e85c87aeaa70548ad52766f1881c2f1257962e2 +Subproject commit 136da02ac32d5011cf9b46b117a0ea1be24e2bad diff --git a/include/common.h b/include/common.h index 02de8e1..fc0dcdf 100644 --- a/include/common.h +++ b/include/common.h @@ -18,12 +18,14 @@ enum tts_arch { PARLER_TTS_ARCH = 0, KOKORO_ARCH = 1, DIA_ARCH = 2, + ORPHEUS_ARCH = 3, }; const std::map SUPPORTED_ARCHITECTURES = { { "parler-tts", PARLER_TTS_ARCH }, { "kokoro", KOKORO_ARCH }, { "dia", DIA_ARCH }, + { "orpheus", ORPHEUS_ARCH } }; struct generation_configuration { diff --git a/include/tts.h b/include/tts.h index 23c55d0..def032b 100644 --- a/include/tts.h +++ b/include/tts.h @@ -4,6 +4,7 @@ #include "parler_model.h" #include "kokoro_model.h" #include "dia_model.h" +#include "orpheus_model.h" #include #include #include @@ -11,6 +12,7 @@ struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * kokoro_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * dia_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); +struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only); struct tts_runner * runner_from_file(const std::string & fname, int n_threads, generation_configuration * config, bool cpu_only = true); int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config); void update_conditional_prompt(tts_runner * runner, const std::string file_path, const std::string 
prompt, bool cpu_only = true); diff --git a/py-gguf/convert_orpheus_to_gguf b/py-gguf/convert_orpheus_to_gguf new file mode 100644 index 0000000..a2247a0 --- /dev/null +++ b/py-gguf/convert_orpheus_to_gguf @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import argparse +from tts_encoders.orpheus_gguf_encoder import OrpheusEncoder, DEFAULT_ORPHEUS_REPO_ID, DEFAULT_SNAC_REPO_ID +from os.path import isdir, dirname + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--save-path", type=str, required=True, help="the path to save the converted gguf tts model to.") + parser.add_argument("--repo-id", type=str, required=False, default=DEFAULT_ORPHEUS_REPO_ID, help="The Huggingface repository to pull the model from.") + parser.add_argument("--snac-repo-id", type=str, required=False, default=DEFAULT_SNAC_REPO_ID, help="The Huggingface repository to pull the snac audio decoder model from.") + parser.add_argument("--never-make-dirs", default=False, action="store_true", help="When set the script will never add new directories.") + return parser.parse_known_args() + + +if __name__ == '__main__': + args, _ = parse_arguments() + if not isdir(dirname(args.save_path)) and args.never_make_dirs: + raise ValueError(f"model path, {args.save_path} is not a valid path.") + OrpheusEncoder(args.save_path, repo_id=args.repo_id, snac_repo_id=args.snac_repo_id).write() diff --git a/py-gguf/requirements.txt b/py-gguf/requirements.txt index 9c8f17e..0f326ad 100644 --- a/py-gguf/requirements.txt +++ b/py-gguf/requirements.txt @@ -4,8 +4,8 @@ gguf==0.10.0 spacy==3.8.5 kokoro==0.9.4 huggingface-hub>=0.26.5 -transformers>=4.43.3 -parler_tts @ git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17 +transformers>=4.46.0 +parler_tts @ git+https://github.com/huggingface/parler-tts.git@d108732cd57788ec86bc857d99a6cabd66663d68 gguf==0.10.0 safetensors==0.5.3 groovy==0.1.2 @@ -14,5 +14,7 @@ gradio-client==1.10.0 llvmlite==0.44.0 numba==0.61.2 scipy>=1.15.2 +snac==1.2.1
soundfile>=0.13.1 -nari-tts @ git+https://github.com/nari-labs/dia.git@7cf50c889c6013f74326cbdcb7696a985a4cf9c1 +nari-tts @ git+https://github.com/nari-labs/dia.git@2811af1c5f476b1f49f4744fabf56cf352be21e5 +torchvision==0.21.0 \ No newline at end of file diff --git a/py-gguf/tts_encoders/__init__.py b/py-gguf/tts_encoders/__init__.py index 6a03ada..81e5edd 100644 --- a/py-gguf/tts_encoders/__init__.py +++ b/py-gguf/tts_encoders/__init__.py @@ -5,3 +5,4 @@ from .kokoro_gguf_encoder import * from .dia_gguf_encoder import * from .dac_gguf_encoder import * +from .orpheus_gguf_encoder import * diff --git a/py-gguf/tts_encoders/dia_gguf_encoder.py b/py-gguf/tts_encoders/dia_gguf_encoder.py index dcafee8..dfa9c0b 100644 --- a/py-gguf/tts_encoders/dia_gguf_encoder.py +++ b/py-gguf/tts_encoders/dia_gguf_encoder.py @@ -82,7 +82,7 @@ def prepare_decoder_tensors(self): elif parts[0] == "norm": self.set_tensor(f"{base}.norm", param) elif parts[0] == "logits_dense": - heads = param.shape[1]; + heads = param.shape[1] for i in range(heads): head = param.data[:, i] self.set_tensor(f"{base}.heads.{i}", head.transpose(0,1)) diff --git a/py-gguf/tts_encoders/kokoro_gguf_encoder.py b/py-gguf/tts_encoders/kokoro_gguf_encoder.py index 54f4db7..ba0685f 100644 --- a/py-gguf/tts_encoders/kokoro_gguf_encoder.py +++ b/py-gguf/tts_encoders/kokoro_gguf_encoder.py @@ -96,7 +96,7 @@ class KokoroEncoder(TTSEncoder): gguf_encoder.write() ``` """ - def __init__(self, model_path: Path | str = "./kokoro.gguf", repo_id: Path | str =DEFAULT_KOKORO_REPO, + def __init__(self, model_path: Path | str = "./kokoro.gguf", repo_id: Path | str = DEFAULT_KOKORO_REPO, voices: Optional[List[str]] = None, use_espeak: bool = False, phonemizer_repo: Path | str = DEFAULT_TTS_PHONEMIZER_REPO): """ diff --git a/py-gguf/tts_encoders/orpheus_gguf_encoder.py b/py-gguf/tts_encoders/orpheus_gguf_encoder.py new file mode 100644 index 0000000..89b9317 --- /dev/null +++ b/py-gguf/tts_encoders/orpheus_gguf_encoder.py @@ -0,0 
+1,244 @@ +from huggingface_hub import hf_hub_download +from pathlib import Path +from snac import SNAC +from snac.layers import DecoderBlock +from transformers import AutoModelForCausalLM +from transformers.models.llama import LlamaForCausalLM +from typing import Dict +from .dac_gguf_encoder import DAC_RESIDUAL_UNIT_PARTS +from .tts_encoder import TTSEncoder +from .tensor_util import get_normalized_weight_from_parametrizations + +import gguf +import json +import math +import torch + +DEFAULT_ORPHEUS_REPO_ID = "canopylabs/orpheus-3b-0.1-ft" +DEFAULT_SNAC_REPO_ID = "hubertsiuzdak/snac_24khz" +ORPHEUS_ARCHITECTURE = "orpheus" + + +class OrpheusEncoder(TTSEncoder): + """ + The purpose of this class is to encode and write the tensors and model configuration for the Orpheus TTS model that + into a GGUF file. + + General Usage: + + ```python + from tts_encoders import OrpheusEncoder + + gguf_encoder = OrpheusEncoder("some/local/path.gguf") + gguf_encoder.write() + ``` + """ + def __init__(self, model_path: Path | str = "./orpheus.gguf", repo_id: Path | str = DEFAULT_ORPHEUS_REPO_ID, + snac_repo_id: Path | str = DEFAULT_SNAC_REPO_ID): + """ + :param Path or str model_path: The path to save the generated GGUF file. + :param Path or str repo_id: The path or repository from which to pull the orpheus model and its tokenizer. + :param Path or str snac_repo_id: The path or repository from which to pull the SNAC audio decoder. 
+ """ + super().__init__(model_path=model_path, architecture=ORPHEUS_ARCHITECTURE) + self._model = None + self._snac_model = None + self._tokenizer_json = None + self._config = None + self.repo_id = repo_id + self.snac_repo_id = snac_repo_id + + @property + def model(self) -> LlamaForCausalLM: + if self._model is None: + try: + self._model = AutoModelForCausalLM.from_pretrained(self.repo_id).eval().to(device="cpu") + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, when attempting to obtain Orpheus at path or repo: '{self.repo_id}'" + ) + raise e + return self._model + + @property + def snac_model(self) -> SNAC: + if self._snac_model is None: + try: + self._snac_model = SNAC.from_pretrained(self.snac_repo_id).eval().to("cpu") + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, when attempting to obtain SNAC Model at path or repo: '{self.snac_repo_id}'" + ) + raise e + return self._snac_model + + @property + def tokenizer_json(self) -> Dict: + if self._tokenizer_json is None: + try: + conf_path = hf_hub_download(repo_id=self.repo_id, filename='tokenizer.json') + except Exception as e: + self.logger.exception( + f"Failed with exception, {e}, attempting to obtain tokenizer.json via repository '{self.repo_id}'." 
+ ) + raise e + with open(conf_path, "r+") as f: + self._tokenizer_json = json.load(f) + return self._tokenizer_json + + def simplify_snac_name(self, name: str) -> str: + parts = name.split(".") + model_index = int(parts[0]) + if model_index == 6: + return "alpha_out" + elif model_index == 7: + return f"final.{parts[1]}" + elif model_index == 0: + return f"in.{parts[1]}" + elif model_index == 1: + return f"up.{parts[1]}" + else: + model_index -= 2 + layer_index = int(parts[2]) + if layer_index == 0: + return f"layers.{model_index}.alpha" + elif layer_index == 1: + return f"layers.{model_index}.{parts[-1]}" + elif layer_index == 2: + return f"layers.{model_index}.noise_{parts[-1]}" + else: + base = f"layers.{model_index}.residual_unit.{layer_index - 3}" + return base + "." + DAC_RESIDUAL_UNIT_PARTS[".".join(parts[-3:])] + + def prepare_tensors(self): + self.prepare_orpheus_tensors() + self.prepare_snac_tensors() + self.prepare_rope_frequencies() + + def prepare_orpheus_tensors(self): + for name, param in self.model.model.named_parameters(): + name = f"orpheus.{name[:-7]}" # all names end in ".weight" for Orpheus + self.set_tensor(name, param) + self.set_tensor("orpheus.lm_head", self.model.lm_head.weight) + + def prepare_snac_tensors(self): + modules = {n: v for n, v in self.snac_model.quantizer.named_modules()} + for name, param in self.snac_model.quantizer.named_parameters(): + if "parametrizations.weight.original0" in name: + param = get_normalized_weight_from_parametrizations(modules, name) + name = name.replace("parametrizations.weight.original0", "weight") + elif "parametrizations.weight" in name: + continue + self.set_tensor(f"snac.{name}", param) + + modules = {n: v for n, v in self.snac_model.decoder.model.named_modules()} + for name, param in self.snac_model.decoder.model.named_parameters(): + if "parametrizations.weight.original0" in name: + param = get_normalized_weight_from_parametrizations(modules, name) + name = 
name.replace("parametrizations.weight.original0", "weight") + elif "parametrizations.weight" in name: + continue + name = self.simplify_snac_name(name) + self.set_tensor(f"snac.{name}", param) + + def prepare_rope_frequencies(self): + """ + Because Llama-3 like Rotary Positional Embeddings are not currently supported out-of-the-box in GGML, + we need to encode the rope frequency vectors to use directly. + """ + base = self.model.config.rope_theta + dim = self.model.config.head_dim + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + factor = self.model.config.rope_scaling.get("factor", 8.0) + low_freq_factor = self.model.config.rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = self.model.config.rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.model.config.rope_scaling.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.set_tensor("orpheus.rope_frequencies", torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_metadata(self): + """ + Implementation of TTSEncoder's Abstract method see TTSEncoder for more information + """ + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + self.metadata = gguf.Metadata.load(None, None, self.repo_id, total_params) + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = 
gguf.size_label(total_params, shared_params, expert_params, expert_count) + + self.set_type() + self.set_gguf_parameters() + self.metadata.set_gguf_meta_model(self.gguf_writer) + self.set_vocab() + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def set_gguf_parameters(self): + """ + The purpose of this function is to add general model configuration to the GGUF file writer. + """ + + # this is not set in Orpheus configuration or on the class level. It is passed as a + # a default parameter to the generation function. + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.stopping_token_id", 128258) + + # ---- Orpheus configuration ---- + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.hidden_size", self.model.config.hidden_size) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.vocab_size", self.model.config.vocab_size) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.attn_heads", self.model.config.num_attention_heads) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.kv_attn_heads", self.model.config.num_key_value_heads) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.head_dim", self.model.config.head_dim) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.layers", self.model.config.num_hidden_layers) + self.gguf_writer.add_uint32( + f"{self.gguf_writer.arch}.kv_hidden_size", + self.model.config.hidden_size // (self.model.config.num_attention_heads // self.model.config.num_key_value_heads) + ) + + # ---- SNAC configuration ---- + self.gguf_writer.add_uint32("snac.audio_token_channels", self.snac_model.quantizer.n_codebooks) + layer_index = 0 + for module in self.snac_model.decoder.model: + if isinstance(module, DecoderBlock): + self.gguf_writer.add_uint32(f"snac.snac_layer_stride_{layer_index}", module.block[1].stride[0]) + self.gguf_writer.add_uint32(f"snac.snac_layer_padding_{layer_index}", module.block[1].padding[0]) + self.gguf_writer.add_uint32(f"snac.snac_layer_grouping_{layer_index}", 
module.block[3].block[1].groups) + layer_index += 1 + + # The file type setting is purely for describing the primary precision of the model as it is stored in the GGUF file. + # This setting *does not* enforce the tensor format or alter tensor processing capabilities in TTS.cpp and is only + # used for reporting. + self.gguf_writer.add_file_type(gguf.LlamaFileType.ALL_F32) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def set_vocab(self): + """ + The purpose of this function is to add the vocab, merges, and configuration for Orpheus' BPE tokenizer + to the GGUF file writer. + """ + assert "model" in self.tokenizer_json and "type" in self.tokenizer_json["model"] and self.tokenizer_json["model"]["type"] == "BPE" \ + and "merges" in self.tokenizer_json["model"] and "vocab" in self.tokenizer_json["model"] + tokens = list(self.tokenizer_json["model"]["vocab"].keys()) + self.logger.debug(f"Orpheus tokenizer vocab contains {len(tokens)} tokens.") + merges = [" ".join(pair) for pair in self.tokenizer_json["model"]["merges"]] + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_merges(merges) + self.gguf_writer.add_eos_token_id(self.model.config.eos_token_id) + self.gguf_writer.add_bos_token_id(self.model.config.bos_token_id) diff --git a/py-gguf/tts_encoders/tensor_util.py b/py-gguf/tts_encoders/tensor_util.py index f546ddd..e5b8773 100644 --- a/py-gguf/tts_encoders/tensor_util.py +++ b/py-gguf/tts_encoders/tensor_util.py @@ -1,4 +1,6 @@ +import torch import torch.nn as nn +from torch.nn.utils.parametrizations import _WeightNorm from torch.nn.utils.weight_norm import WeightNorm from typing import Dict @@ -23,3 +25,26 @@ def get_regularized_weight(modules: Dict[str, nn.Module], parameter_name: str) - hook(module, None) break return module.weight + + +def get_normalized_weight_from_parametrizations(modules: Dict[str, nn.Module], parameter_name: str) -> torch.Tensor: + """ + Attempts to call the default parametrization forward pass for weight normalization
such that the true weight + can be determined via the stored parametrized variables. + + :param Dict[str, nn.Module] modules: a dictionary containing modules belonging to the current module context by name + :param str parameter_name: the base parameter name from which the normalized weight is to be derived. + :return torch.Tensor: the computed normalized weight tensor. + """ + parent_module_name = parameter_name.split(".parametrizations")[0] + if parent_module_name not in modules: + raise KeyError(f"Failed to find module, {parent_module_name}, for parameter, {parameter_name}, in modules dictionary.") + module = modules[parent_module_name] + if "weight" not in module.parametrizations: + raise KeyError(f"Failed to find parameterized weight on module, {parent_module_name}, for parameter, {parameter_name}.") + assert isinstance(module.parametrizations["weight"][0], _WeightNorm) + return torch._weight_norm( + module.parametrizations["weight"].original1, + module.parametrizations["weight"].original0, + module.parametrizations["weight"][0].dim + ) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6244815..3d07940 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,6 +23,9 @@ add_library(tts tts_model.cpp kokoro_model.cpp dia_model.cpp + orpheus_model.cpp + snac_model.cpp + general_neural_audio_codec.cpp ) target_include_directories(tts PUBLIC . 
../include ../ggml/src/) diff --git a/src/dac_model.cpp b/src/dac_model.cpp index d53b7ff..6685007 100644 --- a/src/dac_model.cpp +++ b/src/dac_model.cpp @@ -9,26 +9,12 @@ static const std::map DAC_TENSOR_GGUF_LOOKUP = { {"final.bias", DAC_ENCODER_OUT_BIAS}, {"final.weight", DAC_ENCODER_OUT_KERNEL}, {"final.alpha", DAC_ENCODER_SNAKE_ALPHA}, - {".final.alpha", DAC_ENCODER_LAYER_SNAKE_ALPHA}, - {".final.bias", DAC_ENCODER_LAYER_OUT_BIAS}, - {".final.weight", DAC_ENCODER_LAYER_OUT_KERNEL}, - {".res.initial.alpha", DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE}, - {".res.initial.bias", DAC_ENCODER_LAYER_RES_BLK_IN_BIAS}, - {".res.initial.weight", DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL}, - {".res.final.alpha", DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE}, - {".res.final.bias", DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS}, - {".res.final.weight", DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL}, - {".in_proj.bias", DAC_QUANTIZER_LAYER_IN_BIAS}, - {".in_proj.weight", DAC_QUANTIZER_LAYER_IN_KERNEL}, - {".out_proj.bias", DAC_QUANTIZER_LAYER_OUT_BIAS}, - {".out_proj.weight", DAC_QUANTIZER_LAYER_OUT_KERNEL}, - {".codebook.weight", DAC_QUANTIZER_LAYER_CODEBOOK}, }; void dac_model::prep_constants(gguf_context * meta) { int output_heads_key = search_for_gguf_keys(meta, {"parler-tts.decoder.output_heads", "output_heads", "dia.decoder.output_heads"}); if (output_heads_key != -1) { - n_heads = gguf_get_val_u32(meta, output_heads_key);; + n_heads = gguf_get_val_u32(meta, output_heads_key); } int sampling_factor_key = search_for_gguf_keys(meta, {"dac.up_sampling_factor", "up_sampling_factor"}); @@ -40,37 +26,30 @@ void dac_model::prep_constants(gguf_context * meta) { if (max_gen_key != -1) { max_generation_size = gguf_get_val_u32(meta, max_gen_key); } - - for (int i = 0; i < (int) layers.size(); i++) { - std::string stride_kw = "dac_layer_stride_" + std::to_string(i); - std::string padding_kw = "dac_layer_padding_" + std::to_string(i); - int layer_stride_key = search_for_gguf_keys(meta, {"dac." 
+ stride_kw, stride_kw}); - if (layer_stride_key == -1) { - TTS_ABORT("key %s must be specified in gguf file.", ("dac." + stride_kw).c_str()); - } - layers[i].stride = gguf_get_val_u32(meta, layer_stride_key); - int layer_padding_key = search_for_gguf_keys(meta, {"dac." + padding_kw, padding_kw}); - if (layer_padding_key == -1) { - TTS_ABORT("key %s must be specified in gguf file.", ("dac." + padding_kw).c_str()); - } - layers[i].padding = gguf_get_val_u32(meta, layer_padding_key); - } } void dac_model::prep_layers(gguf_context * meta) { for (int i = 0; i < n_heads; i++) { - dac_quantize_layer l; - quantizer_layers.push_back(l); + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); } for (int i = 0; i < n_layers; i++) { - dac_layer l; - // all dac layers have 3 residual units - for (int ii = 0; ii < 3; ii++) { - dac_residual_unit u; - l.residual_blocks.push_back(u); + std::string stride_key = "dac_layer_stride_" + std::to_string(i); + std::string padding_key = "dac_layer_padding_" + std::to_string(i); + int layer_stride_key = search_for_gguf_keys(meta, {"dac." + stride_key, stride_key}); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = search_for_gguf_keys(meta, {"dac." 
+ padding_key, padding_key}); + if (layer_padding_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the DAC audio decoder.", padding_key.c_str()); } - layers.push_back(l); + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + } + ); } } @@ -78,95 +57,6 @@ void dac_model::assign_weight(std::string name, ggml_tensor * tensor) { assign_to_audio_encoder(this, name, tensor); } -void assign_residual_unit(dac_model * model, dac_residual_unit * l, std::string name, ggml_tensor * tensor) { - try { - dac_tensor tensor_type = DAC_TENSOR_GGUF_LOOKUP.at(name); - switch (tensor_type) { - case DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE: - l->in_snake_alpha = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->in_snake_alpha, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE: - l->out_snake_alpha = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->out_snake_alpha, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL: - l->in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->in_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL: - l->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(l->out_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_IN_BIAS: - l->in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(l->in_conv_bias, tensor); - break; - case DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS: - l->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(l->out_conv_bias, tensor); - break; - default: - fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str()); - break; - } - } catch (const std::out_of_range& e) { - TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); - } - -} - -void assign_dac_layer(dac_model * model, 
dac_layer * layer, std::string name, ggml_tensor * tensor) { - if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) { - switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { - case DAC_ENCODER_LAYER_SNAKE_ALPHA: - layer->snake_alpha_in = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->snake_alpha_in, tensor); - break; - case DAC_ENCODER_LAYER_OUT_KERNEL: - layer->out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->out_conv_kernel, tensor); - break; - case DAC_ENCODER_LAYER_OUT_BIAS: - layer->out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(layer->out_conv_bias, tensor); - break; - default: - fprintf(stdout, "layer unassigned tensor %s\n", name.c_str()); - break; - } - } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { - auto pair = parse_layer_count(name); - int l = pair.first; - std::string lt_name = pair.second; - assign_residual_unit(model, &layer->residual_blocks[l], lt_name, tensor); - } -} - -void assign_quantizer_layer(dac_model * model, dac_quantize_layer * layer, std::string name, ggml_tensor * tensor) { - try { - switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { - case DAC_QUANTIZER_LAYER_OUT_KERNEL: - layer->out_proj_kernel = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->out_proj_kernel, tensor); - break; - case DAC_QUANTIZER_LAYER_OUT_BIAS: - layer->out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); - model->set_tensor(layer->out_proj_bias, tensor); - break; - case DAC_QUANTIZER_LAYER_CODEBOOK: - layer->codebook = ggml_dup_tensor(model->ctx, tensor); - model->set_tensor(layer->codebook, tensor); - break; - default: - fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str()); - break; - } - } catch (const std::out_of_range& e) { - TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); - } -} - void assign_to_audio_encoder(dac_model * model, 
std::string name, ggml_tensor * tensor) { if (DAC_TENSOR_GGUF_LOOKUP.find(name) != DAC_TENSOR_GGUF_LOOKUP.end()) { switch(DAC_TENSOR_GGUF_LOOKUP.at(name)) { @@ -199,14 +89,14 @@ void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * int l = pair.first; std::string lt_name = pair.second; if (name.find("quantizers") != std::string::npos) { - assign_quantizer_layer(model, &model->quantizer_layers[l], lt_name, tensor); + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) model, model->quantizer_layers[l], lt_name, tensor); } else { - assign_dac_layer(model, &model->layers[l - 1], lt_name, tensor); + general_neural_audio_codec::assign_to_layer((tts_model *) model, model->layers[l - 1], lt_name, tensor); } } } -static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers) { +static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers) { struct ggml_tensor * embd; dctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.sequence_length*dctx->model->n_heads); @@ -220,10 +110,7 @@ static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, st auto quantize_layer = dctx->model->quantizer_layers[i]; struct ggml_tensor * code = ggml_cont(ctx, ggml_view_2d(ctx, dctx->inp_tokens, 1, batch.sequence_length, dctx->model->n_heads*ggml_type_size(GGML_TYPE_I32), i*ggml_type_size(GGML_TYPE_I32))); code = ggml_reshape_1d(ctx, code, batch.sequence_length); - code = ggml_get_rows(ctx, quantize_layer.codebook, code); - code = ggml_cont(ctx, ggml_transpose(ctx, code)); - code = ggml_conv_1d(ctx, quantize_layer.out_proj_kernel, code, 1, 0, 1); - code = ggml_add(ctx, code, quantize_layer.out_proj_bias); + code = general_neural_audio_codec::build_quantize_layer(ctx, code, quantize_layer); if (i == 0) { embd = code; @@ -234,27 +121,6 @@ static struct 
ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, st return embd; } -static struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, dac_residual_unit & u, int padding, int dilation) { - struct ggml_tensor * residual = cur; - cur = snake_1d(ctx, u.in_snake_alpha, cur); - cur = ggml_conv_1d(ctx, u.in_conv_kernel, cur, 1, padding, dilation); - cur = ggml_add(ctx, cur, u.in_conv_bias); - cur = snake_1d(ctx, u.out_snake_alpha, cur); - cur = ggml_conv_1d(ctx, u.out_conv_kernel, cur, 1, 0, 1); - cur = ggml_add(ctx, cur, u.out_conv_bias); - return ggml_add(ctx, cur, residual); -} - -static struct ggml_tensor * build_decoder_block(ggml_context * ctx, struct ggml_tensor * cur, dac_layer & l, struct dac_context * dctx) { - cur = snake_1d(ctx, l.snake_alpha_in, cur); - cur = ggml_conv_transpose_1d(ctx, l.out_conv_kernel, cur, l.stride, l.padding, 1, 0, 1); - cur = ggml_add(ctx, cur, l.out_conv_bias); - for (int i = 0; i < l.residual_blocks.size(); i++) { - cur = build_residual_unit(ctx, cur, l.residual_blocks[i], pow(3, (i + 1)), pow(3, i)); - } - return cur; -} - struct dac_context * build_new_dac_context(struct dac_model * model, int n_threads, bool use_cpu) { dac_context * dctx = new dac_context(model, n_threads); if (!use_cpu) { @@ -291,7 +157,7 @@ struct ggml_cgraph * dac_runner::build_dac_graph(dac_ubatch & batch) { cur = ggml_conv_1d(ctx, model->in_conv_kernel, inputs, 1, 3, 1); cur = ggml_add(ctx, cur, model->in_conv_bias); for (auto l : model->layers) { - cur = build_decoder_block(ctx, cur, l, dctx); + cur = general_neural_audio_codec::build_layer(ctx, cur, l); } cur = snake_1d(ctx, model->snake_alpha, cur); cur = ggml_conv_1d(ctx, model->out_conv_kernel, cur, 1, 3, 1); diff --git a/src/dac_model.h b/src/dac_model.h index 6befa32..be43ad0 100644 --- a/src/dac_model.h +++ b/src/dac_model.h @@ -1,7 +1,7 @@ #ifndef dac_model_h #define dac_model_h -#include "tts_model.h" +#include "general_neural_audio_codec.h" #include enum 
dac_tensor { @@ -10,40 +10,6 @@ enum dac_tensor { DAC_ENCODER_OUT_KERNEL, DAC_ENCODER_OUT_BIAS, DAC_ENCODER_SNAKE_ALPHA, - DAC_ENCODER_LAYER_SNAKE_ALPHA, - DAC_ENCODER_LAYER_OUT_KERNEL, - DAC_ENCODER_LAYER_OUT_BIAS, - DAC_ENCODER_LAYER_RES_BLK_IN_SNAKE, - DAC_ENCODER_LAYER_RES_BLK_OUT_SNAKE, - DAC_ENCODER_LAYER_RES_BLK_IN_KERNEL, - DAC_ENCODER_LAYER_RES_BLK_OUT_KERNEL, - DAC_ENCODER_LAYER_RES_BLK_IN_BIAS, - DAC_ENCODER_LAYER_RES_BLK_OUT_BIAS, - DAC_QUANTIZER_LAYER_IN_KERNEL, - DAC_QUANTIZER_LAYER_IN_BIAS, - DAC_QUANTIZER_LAYER_OUT_KERNEL, - DAC_QUANTIZER_LAYER_OUT_BIAS, - DAC_QUANTIZER_LAYER_CODEBOOK -}; - -struct dac_residual_unit { - struct ggml_tensor * in_snake_alpha; - struct ggml_tensor * in_conv_kernel; - struct ggml_tensor * in_conv_bias; - struct ggml_tensor * out_snake_alpha; - struct ggml_tensor * out_conv_kernel; - struct ggml_tensor * out_conv_bias; -}; - -struct dac_layer { - struct ggml_tensor * snake_alpha_in; - struct ggml_tensor * out_conv_kernel; - struct ggml_tensor * out_conv_bias; - - uint32_t padding; - uint32_t stride; - - std::vector residual_blocks; }; struct dac_quantize_layer { @@ -52,6 +18,7 @@ struct dac_quantize_layer { struct ggml_tensor * codebook; }; +// DAC, Descript Audio Codec, is a channel token to audio autoencoder model (though we only use its decoder functionality). // this struct maintains the static tensors for the dac audio decoder graph. // As such, this is designed to contain basic configuration and ggml tensor support for DAC. // The dac_runner describes how the graph is built and run. 
@@ -67,8 +34,8 @@ struct dac_model : tts_model { struct ggml_tensor * out_conv_kernel; struct ggml_tensor * out_conv_bias; struct ggml_tensor * snake_alpha; - std::vector layers; - std::vector quantizer_layers; + std::vector layers; + std::vector quantizer_layers; void assign_weight(std::string name, ggml_tensor * weight); void prep_constants(gguf_context * meta); @@ -81,9 +48,6 @@ struct dac_model : tts_model { }; // for loading DAC model from gguf file -void assign_residual_unit(dac_model * model, dac_residual_unit * layer, std::string name, ggml_tensor * tensor); -void assign_dac_layer(dac_model * model, dac_layer * layer, std::string name, ggml_tensor * tensor); -void assign_quantizer_layer(dac_model * model, dac_quantize_layer layer, std::string name, ggml_tensor * tensor); void assign_to_audio_encoder(dac_model * model, std::string name, ggml_tensor * tensor); // the context used for running the dac model @@ -91,10 +55,7 @@ struct dac_context : runner_context { dac_context(dac_model * model, int n_threads): runner_context(n_threads), model(model) {}; struct dac_model * model; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - + struct ggml_tensor * inp_tokens; void build_schedule() { @@ -109,9 +70,7 @@ struct dac_ubatch { uint32_t sequence_length; }; -static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers); -static struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, dac_residual_unit & u, int padding, int dilation); -static struct ggml_tensor * build_decoder_block(ggml_context * ctx, struct ggml_tensor * cur, dac_layer & l, struct dac_context * dctx); +static struct ggml_tensor * dac_build_audio_inputs(struct ggml_context * ctx, struct dac_context * dctx, const dac_ubatch & batch, std::vector layers); // This struct is intended to manage the dac model's graph compilation and compute 
function. struct dac_runner : tts_runner { diff --git a/src/dia_model.h b/src/dia_model.h index 69ba6f6..bdca91d 100644 --- a/src/dia_model.h +++ b/src/dia_model.h @@ -99,7 +99,6 @@ struct dia_context : runner_context { uint32_t current_position = 0; // current position in the active sequence int delay_steps = -1; // the max remaining steps to take before terminating; is set after an eos token is seen on the first output channel size_t prompt_size = 0; - float * logits = nullptr; uint32_t max_generation_size; // this is set by the generation context or defaults to the config set on dia model. diff --git a/src/general_neural_audio_codec.cpp b/src/general_neural_audio_codec.cpp new file mode 100644 index 0000000..8f7893e --- /dev/null +++ b/src/general_neural_audio_codec.cpp @@ -0,0 +1,172 @@ +#include "general_neural_audio_codec.h" +#include +#include +#include + +namespace general_neural_audio_codec { + // This contains a mapping between string names and gguf_tensor enum values for the purposes of assigning the weights from a gguf file + // to the general_neural_audio_codec::layer. + // Please note that some gguf_tensor values have multiple keys; this is to support backwards compatibility with original DAC settings. 
+ static const std::map GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP = { + {".final.alpha", LAYER_ALPHA}, + {".final.bias", LAYER_INPUT_BIAS}, + {".final.weight", LAYER_INPUT_KERNEL}, + {".alpha", LAYER_ALPHA}, + {".bias", LAYER_INPUT_BIAS}, + {".weight", LAYER_INPUT_KERNEL}, + {".noise_weight", LAYER_NOISE_KERNEL}, + {".res.initial.alpha", RESIDUAL_UNIT_INPUT_ALPHA}, + {".res.initial.bias", RESIDUAL_UNIT_INPUT_BIAS}, + {".res.initial.weight", RESIDUAL_UNIT_INPUT_KERNEL}, + {".res.final.alpha", RESIDUAL_UNIT_OUTPUT_ALPHA}, + {".res.final.bias", RESIDUAL_UNIT_OUTPUT_BIAS}, + {".res.final.weight", RESIDUAL_UNIT_OUTPUT_KERNEL}, + {".in_alpha", RESIDUAL_UNIT_INPUT_ALPHA}, + {".in_bias", RESIDUAL_UNIT_INPUT_BIAS}, + {".in_weight", RESIDUAL_UNIT_INPUT_KERNEL}, + {".out_alpha", RESIDUAL_UNIT_OUTPUT_ALPHA}, + {".out_bias", RESIDUAL_UNIT_OUTPUT_BIAS}, + {".out_weight", RESIDUAL_UNIT_OUTPUT_KERNEL}, + {".out_proj.bias", QUANTIZER_LAYER_OUT_BIAS}, + {".out_proj.weight", QUANTIZER_LAYER_OUT_KERNEL}, + {".codebook.weight", QUANTIZER_LAYER_CODEBOOK}, + }; + + void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor) { + try { + gguf_tensor tensor_type = GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name); + switch (tensor_type) { + case RESIDUAL_UNIT_INPUT_ALPHA: + unit.in_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.in_alpha, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_ALPHA: + unit.out_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.out_alpha, tensor); + break; + case RESIDUAL_UNIT_INPUT_KERNEL: + unit.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.in_conv_kernel, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_KERNEL: + unit.out_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(unit.out_conv_kernel, tensor); + break; + case RESIDUAL_UNIT_INPUT_BIAS: + unit.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, 
tensor)); + model->set_tensor(unit.in_conv_bias, tensor); + break; + case RESIDUAL_UNIT_OUTPUT_BIAS: + unit.out_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(unit.out_conv_bias, tensor); + break; + default: + fprintf(stdout, "residual unit unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::residual_unit tensor.", name.c_str()); + } + } + + void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor) { + if (GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.find(name) != GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.end()) { + switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) { + case LAYER_ALPHA: + l.in_alpha = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.in_alpha, tensor); + break; + case LAYER_INPUT_KERNEL: + l.in_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.in_conv_kernel, tensor); + break; + case LAYER_INPUT_BIAS: + l.in_conv_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(l.in_conv_bias, tensor); + break; + case LAYER_NOISE_KERNEL: + l.noise_conv_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.noise_conv_kernel, tensor); + break; + default: + fprintf(stdout, "layer unassigned tensor %s\n", name.c_str()); + break; + } + } else if (std::find_if(name.begin(), name.end(), ::isdigit) != name.end()) { + auto pair = parse_layer_count(name); + int i = pair.first; + std::string lt_name = pair.second; + assign_to_residual_unit(model, l.residual_blocks[i], lt_name, tensor); + } else { + TTS_ABORT("Tensor, '%s', is not a valid tensor general_neural_audio_codec::layer tensor.", name.c_str()); + } + } + + void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor) { + try { + 
switch(GENERAL_NEURAL_AUDIO_CODEC_TENSOR_LOOKUP.at(name)) { + case QUANTIZER_LAYER_OUT_KERNEL: + l.out_proj_kernel = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.out_proj_kernel, tensor); + break; + case QUANTIZER_LAYER_OUT_BIAS: + l.out_proj_bias = ggml_dup_tensor(model->ctx, ggml_transpose(model->ctx, tensor)); + model->set_tensor(l.out_proj_bias, tensor); + break; + case QUANTIZER_LAYER_CODEBOOK: + l.codebook = ggml_dup_tensor(model->ctx, tensor); + model->set_tensor(l.codebook, tensor); + break; + default: + fprintf(stdout, "quantized layer unassigned tensor %s\n", name.c_str()); + break; + } + } catch (const std::out_of_range& e) { + // older GGUF files still have the unused in_proj convolutional layer, so ignore it if we find it. + if (!has_prefix(name, ".in_proj")) { + TTS_ABORT("Error: %s\nTensor, '%s', is not a valid tensor.", e.what(), name.c_str()); + } + } + } + + struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit) { + struct ggml_tensor * residual = cur; + cur = snake_1d(ctx, unit.in_alpha, cur); + if (unit.groups > 1) { + // depthwise 1d convolution is equivalent to convolution in which grouping is equal to filter size. + // If there is a divergence between filter size and grouping then the kernel's output filters will not be zero. 
+ TTS_ASSERT(unit.in_conv_kernel->ne[1] == 1); + cur = ggml_conv_1d_dw(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation); + } else { + cur = ggml_conv_1d(ctx, unit.in_conv_kernel, cur, 1, unit.padding, unit.dilation); + } + cur = ggml_add(ctx, cur, unit.in_conv_bias); + cur = snake_1d(ctx, unit.out_alpha, cur); + cur = ggml_conv_1d(ctx, unit.out_conv_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, unit.out_conv_bias); + return ggml_add(ctx, cur, residual); + } + + struct ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise) { + cur = snake_1d(ctx, l.in_alpha, cur); + cur = ggml_conv_transpose_1d(ctx, l.in_conv_kernel, cur, l.stride, l.padding, 1, 0, 1); + cur = ggml_add(ctx, cur, l.in_conv_bias); + if (l.noise_conv_kernel && noise) { + struct ggml_tensor * x = ggml_conv_1d(ctx, l.noise_conv_kernel, cur, 1, 0, 1); + x = ggml_mul(ctx, x, noise); + cur = ggml_add(ctx, cur, x); + } + for (int i = 0; i < l.residual_blocks.size(); i++) { + cur = build_residual_unit(ctx, cur, l.residual_blocks[i]); + } + return cur; + } + + struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l) { + cur = ggml_get_rows(ctx, l.codebook, cur); + cur = ggml_cont(ctx, ggml_transpose(ctx, cur)); + cur = ggml_conv_1d(ctx, l.out_proj_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, l.out_proj_bias); + return cur; + } +} diff --git a/src/general_neural_audio_codec.h b/src/general_neural_audio_codec.h new file mode 100644 index 0000000..1ec0a42 --- /dev/null +++ b/src/general_neural_audio_codec.h @@ -0,0 +1,67 @@ +#pragma once + +#include "tts_model.h" + +// This namespace implements a general abstraction of the core functionality used in common neural audio codecs like DAC and SNAC. 
+namespace general_neural_audio_codec { + enum gguf_tensor { + LAYER_ALPHA, + LAYER_INPUT_KERNEL, + LAYER_INPUT_BIAS, + LAYER_NOISE_KERNEL, + RESIDUAL_UNIT_INPUT_ALPHA, + RESIDUAL_UNIT_OUTPUT_ALPHA, + RESIDUAL_UNIT_INPUT_KERNEL, + RESIDUAL_UNIT_OUTPUT_KERNEL, + RESIDUAL_UNIT_INPUT_BIAS, + RESIDUAL_UNIT_OUTPUT_BIAS, + QUANTIZER_LAYER_OUT_KERNEL, + QUANTIZER_LAYER_OUT_BIAS, + QUANTIZER_LAYER_CODEBOOK + }; + + struct residual_vector_quantize_layer { + struct ggml_tensor * out_proj_kernel; + struct ggml_tensor * out_proj_bias; + struct ggml_tensor * codebook; + }; + + struct residual_unit { + residual_unit(uint32_t padding, uint32_t dilation, uint32_t groups = 1): padding(padding), dilation(dilation), groups(groups) {} + struct ggml_tensor * in_alpha; + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * out_alpha; + struct ggml_tensor * out_conv_kernel; + struct ggml_tensor * out_conv_bias; + + uint32_t padding; + uint32_t dilation; + uint32_t groups; + }; + + struct layer { + layer(uint32_t padding, uint32_t stride, uint32_t groups = 1): padding(padding), stride(stride) { + for (int i = 0; i < 3; i++) { + residual_blocks.push_back(residual_unit{(uint32_t) pow(3, (i + 1)), (uint32_t) pow(3, i), groups}); + } + } + struct ggml_tensor * in_alpha; + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * noise_conv_kernel = nullptr; + + uint32_t padding; + uint32_t stride; + + std::vector residual_blocks; + }; + + void assign_to_residual_unit(tts_model * model, residual_unit & unit, std::string name, struct ggml_tensor * tensor); + void assign_to_layer(tts_model * model, layer & l, std::string name, struct ggml_tensor * tensor); + void assign_to_quantize_layer(tts_model * model, residual_vector_quantize_layer & l, std::string name, struct ggml_tensor * tensor); + + struct ggml_tensor * build_residual_unit(ggml_context * ctx, struct ggml_tensor * cur, residual_unit & unit); + struct 
ggml_tensor * build_layer(ggml_context * ctx, struct ggml_tensor * cur, layer & l, struct ggml_tensor * noise = nullptr); + struct ggml_tensor * build_quantize_layer(ggml_context * ctx, struct ggml_tensor * cur, residual_vector_quantize_layer & l); +} diff --git a/src/kokoro_model.cpp b/src/kokoro_model.cpp index dad1cf5..a4b8dfc 100644 --- a/src/kokoro_model.cpp +++ b/src/kokoro_model.cpp @@ -1249,7 +1249,7 @@ void kokoro_runner::prepare_post_load() { } void kokoro_runner::set_inputs(kokoro_ubatch & batch, uint32_t total_size) { - random_gen(total_size * model->up_sampling_factor * (model->harmonic_num + 1), ((float*)kctx->uv_noise_data->data) + 4); + random_uniform_gen(total_size * model->up_sampling_factor * (model->harmonic_num + 1), ((float*)kctx->uv_noise_data->data) + 4); ((float*) kctx->uv_noise_data->data)[0] = model->voice_threshold; ((float*) kctx->uv_noise_data->data)[1] = model->noise_std; ((float*) kctx->uv_noise_data->data)[2] = model->sin_amp; diff --git a/src/kokoro_model.h b/src/kokoro_model.h index 328150d..1985c11 100644 --- a/src/kokoro_model.h +++ b/src/kokoro_model.h @@ -324,7 +324,6 @@ struct kokoro_duration_context : runner_context { size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; float * lens = nullptr; struct ggml_tensor * inp_tokens; @@ -405,10 +404,7 @@ struct kokoro_context : runner_context { uint32_t total_duration; uint32_t sequence_length; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - + struct ggml_tensor * inp_tokens; struct ggml_tensor * duration_pred; struct ggml_tensor * duration_mask; diff --git a/src/orpheus_model.cpp b/src/orpheus_model.cpp new file mode 100644 index 0000000..dc0fa8f --- /dev/null +++ b/src/orpheus_model.cpp @@ -0,0 +1,464 @@ +#include "orpheus_model.h" + +// These tokens and variables aren't defined in the Orpheus' model configuration but instead are defined inline in various python functions. 
+// As such, they are not discoverable so defining them as unconfigurable constants should be fine. +static constexpr std::array orpheus_voices = {"zoe", "zac","jess", "leo", "mia", "julia", "leah"}; +static constexpr std::array orpheus_prepended_tokens = { 128259, 128000 }; +static constexpr std::array orpheus_appended_tokens = { 128009, 128260, 128261, 128257 }; + +void orpheus_model::assign_weight(std::string name, struct ggml_tensor * tensor) { + if (name == "norm") { + output_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(output_norm, tensor); + } else if (name == "lm_head") { + head = ggml_dup_tensor(ctx, tensor); + set_tensor(head, tensor); + } else if (name == "embed_tokens") { + embd = ggml_dup_tensor(ctx, tensor); + set_tensor(embd, tensor); + } else if (name == "rope_frequencies") { + rope_frequencies = ggml_dup_tensor(ctx, tensor); + set_tensor(rope_frequencies, tensor); + } else if (has_prefix(name, "layers")) { + auto lpair = parse_layer_count(name); + int l = lpair.first; + std::string lt_name = lpair.second; + assign_to_layer(lt_name, layers[l], tensor); + } +} + +void orpheus_model::assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor) { + if (part == ".self_attn.k_proj") { + layer.k = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.k, tensor); + } else if (part == ".self_attn.q_proj") { + layer.q = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.q, tensor); + } else if (part == ".self_attn.v_proj") { + layer.v = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.v, tensor); + } else if (part == ".self_attn.o_proj") { + layer.o = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.o, tensor); + } else if (part == ".mlp.gate_proj") { + layer.gate = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.gate, tensor); + } else if (part == ".mlp.up_proj") { + layer.up = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.up, tensor); + } else if (part == ".mlp.down_proj") { + layer.down = ggml_dup_tensor(ctx, tensor); + 
set_tensor(layer.down, tensor); + } else if (part == ".input_layernorm") { + layer.input_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.input_norm, tensor); + } else if (part == ".post_attention_layernorm") { + layer.post_attention_norm = ggml_dup_tensor(ctx, tensor); + set_tensor(layer.post_attention_norm, tensor); + } +} + +void orpheus_model::prep_constants(gguf_context * meta) { + // get constants for orpheus + int vocab_size_key = gguf_find_key(meta, "orpheus.vocab_size"); + if (vocab_size_key != -1) { + vocab_size = gguf_get_val_u32(meta, vocab_size_key); + } + + int attn_heads_key = gguf_find_key(meta, "orpheus.attn_heads"); + if (attn_heads_key != -1) { + n_attn_heads = gguf_get_val_u32(meta, attn_heads_key); + } + + int kv_attn_heads_key = gguf_find_key(meta, "orpheus.kv_attn_heads"); + if (kv_attn_heads_key != -1) { + n_kv_attn_heads = gguf_get_val_u32(meta, kv_attn_heads_key); + } + + int head_size_key = gguf_find_key(meta, "orpheus.head_dim"); + if (head_size_key != -1) { + head_size = gguf_get_val_u32(meta, head_size_key); + } + + int stopping_token_key = gguf_find_key(meta, "orpheus.stopping_token_id"); + if (stopping_token_key != -1) { + stopping_token_id = gguf_get_val_u32(meta, stopping_token_key);; + } + + int eos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id"); + if (eos_token_id_key != -1) { + eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + } + + int bos_token_id_key = gguf_find_key(meta, "tokenizer.ggml.bos_token_id"); + if (bos_token_id_key != -1) { + bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + } + + int hidden_size_key = gguf_find_key(meta, "orpheus.hidden_size"); + if (hidden_size_key != -1) { + hidden_size = gguf_get_val_u32(meta, hidden_size_key); + } + + int kv_hidden_size_key = gguf_find_key(meta, "orpheus.kv_hidden_size"); + if (kv_hidden_size_key != -1) { + kv_hidden_size = gguf_get_val_u32(meta, kv_hidden_size_key); + } +} + +void orpheus_model::prep_layers(gguf_context * meta) { + 
int n_layers_key = gguf_find_key(meta, "orpheus.layers"); + if (n_layers_key == -1) { + TTS_ABORT("the 'orpheus.layers' must be specified in the GGUF file."); + } + n_layers = (int) gguf_get_val_u32(meta, n_layers_key); + for (int i = 0; i < n_layers; i++) { + layers.push_back(orpheus_layer{}); + } +} + +struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight) { + float eps = 0.00001; + return ggml_mul(ctx, ggml_rms_norm(ctx, x, eps), weight); +} + +struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch) { + octx->attn_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, (int64_t) octx->current_position + batch.n_tokens, (int64_t) octx->current_position + batch.n_tokens); + ggml_set_input(octx->attn_mask); + return octx->attn_mask; +} + + void orpheus_context::reset() { + output_tokens.clear(); + current_position = 0; + n_outputs = 0; + } + +orpheus_context * build_new_orpheus_context(orpheus_model * model, int n_threads, bool use_cpu) { + orpheus_context * octx = new orpheus_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + octx->backend = ggml_backend_metal_init(); +#endif + } + octx->backend_cpu = ggml_backend_cpu_init(); + octx->set_threads(); + octx->build_schedule(); + octx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return octx; +} + +void orpheus_runner::orpheus_kv_cache_init() { + ggml_backend_buffer_type_t buft = nullptr; + if (octx->backend != nullptr) { +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#endif + } else { + buft = ggml_backend_cpu_buffer_type(); + } + + struct ggml_init_params params = { + /*.mem_size =*/ (2u * model->layers.size() + 1)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + TTS_ABORT("%s: failed to initialze ggml context 
for key value cache.\n", __func__); + } + if (!kv_self) { + kv_self = new orpheus_kv_cache; + } + kv_self->ctx = ctx; + kv_self->k_l.reserve(model->layers.size()); + kv_self->v_l.reserve(model->layers.size()); + + for (int i = 0; i < (int) model->layers.size(); i++) { + ggml_tensor * k = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size)); + ggml_tensor * v = ggml_new_tensor_1d(kv_self->ctx, kv_self->cache_type, model->hidden_size * (model->max_context_length + model->max_generation_size)); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + kv_self->k_l.push_back(k); + kv_self->v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(kv_self->ctx, buft); + ggml_backend_buffer_clear(buf, 0); + kv_self->buf = buf; + } + + void orpheus_runner::orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat) { + k_cur = ggml_rope_ext(ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, k_cur, model->head_size, model->n_kv_attn_heads, n_tokens)), octx->positions, model->rope_frequencies, + model->head_size, 2,0, 500000.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + // A performance comparison between this method, i.e. performing 3 incremental copy operations in order to achieve repeat_interleave, + // and performing the repeat operation upfront before performing a single copy needs to be performed in order to better optimize this function. + // Additionally, it might be more performant for the values transposition to be performed prior to appending it to the cache, as it would save us + // from incrementally larger transpositions with generation.
+ for (int i = 0; i < repeat; i++) { + struct ggml_tensor * k_cache_view = ggml_view_3d( + ctx, + kv_self->k_l[index], + model->head_size, + model->n_kv_attn_heads, + n_tokens, + ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size + ); + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + + struct ggml_tensor * v_cache_view = ggml_view_3d( + ctx, + kv_self->v_l[index], + model->head_size, + model->n_kv_attn_heads, + n_tokens, + ggml_element_size(kv_self->k_l[index]) * model->head_size * repeat, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[index]) * model->n_attn_heads * model->head_size * octx->current_position + i * ggml_element_size(kv_self->k_l[index]) * model->head_size + ); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); + } +} + +struct ggml_cgraph * orpheus_runner::build_orpheus_graph(orpheus_ubatch & batch) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + const int32_t full_sequence_length = octx->current_position + (uint32_t) batch.n_tokens; + octx->positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(octx->positions); + octx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); + ggml_set_input(octx->inp_tokens); + inpL = ggml_get_rows(ctx, model->embd, octx->inp_tokens); + + struct ggml_tensor * KQ_mask_dec = build_attn_mask(ctx, octx, batch); + + for (int l = 0; l < model->n_layers; l++) { + struct ggml_tensor * residual = inpL; + cur = orpheus_build_layer_norm(ctx, inpL, model->layers[l].input_norm); + + struct ggml_tensor * 
attn_out; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx, model->layers[l].q, cur); + struct ggml_tensor * Kcur = ggml_mul_mat(ctx, model->layers[l].k, cur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx, model->layers[l].v, cur); + + orpheus_build_kv_store(ctx, gf, Kcur, Vcur, l, batch.n_tokens, 3); + struct ggml_tensor * k = + ggml_cont(ctx, ggml_view_3d(ctx, kv_self->k_l[l], + model->head_size, full_sequence_length, model->n_attn_heads, + ggml_element_size(kv_self->k_l[l]) * model->n_attn_heads * model->head_size, + ggml_element_size(kv_self->k_l[l]) * model->head_size, + 0)); + + struct ggml_tensor * v = + ggml_view_2d(ctx, kv_self->v_l[l], + model->hidden_size, full_sequence_length, + ggml_element_size(kv_self->k_l[l]) * model->hidden_size, + 0); + + v = ggml_cont_3d(ctx, ggml_transpose(ctx, v), full_sequence_length, model->head_size, model->n_attn_heads); + + Qcur = ggml_rope_ext( + ctx, ggml_cont(ctx, ggml_reshape_3d(ctx, Qcur, model->head_size, model->n_attn_heads, batch.n_tokens)), + octx->positions, model->rope_frequencies, model->head_size, 2, 0, 500000.0f, // rope theta + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + struct ggml_tensor * q = ggml_cont(ctx, ggml_permute(ctx, Qcur, 0, 2, 1, 3)); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + kq = ggml_soft_max_ext(ctx, kq, KQ_mask_dec, 1.0f/sqrtf(model->head_size), 0.0f); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, kq, v); + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 2, 0, 1, 3); + attn_out = ggml_cont_2d(ctx, kqv_merged, model->hidden_size, batch.n_tokens); + attn_out = ggml_mul_mat(ctx, model->layers[l].o, attn_out); + } + + cur = ggml_add(ctx, attn_out, residual); + + struct ggml_tensor * residualffn = cur; + + // mlp + { + cur = orpheus_build_layer_norm(ctx, cur, model->layers[l].post_attention_norm); + cur = ggml_mul(ctx, ggml_silu(ctx, ggml_mul_mat(ctx, model->layers[l].gate, cur)), ggml_mul_mat(ctx, model->layers[l].up, cur)); + cur = ggml_mul_mat(ctx, 
model->layers[l].down, cur); + } + cur = ggml_add(ctx, cur, residualffn); + inpL = cur; + } + + cur = orpheus_build_layer_norm(ctx, cur, model->output_norm); + // only about 40k of the output head is actually used for generation purposes. Ideally the head tensor should be shrunk and sampled tokens should be incremented. + cur = ggml_mul_mat(ctx, model->head, cur); + if (batch.n_tokens > 1) { + cur = ggml_cont(ctx, ggml_view_1d(ctx, cur, model->vocab_size, ggml_element_size(cur) * (cur->ne[1] - 1) * model->vocab_size)); + } + ggml_build_forward_expand(gf, cur); + free_build(); + + return gf; +} + +void orpheus_runner::decode(orpheus_ubatch & batch) { + ggml_backend_sched_reset(octx->sched); + + octx->output_tokens.reserve(model->max_generation_size); + + const size_t new_size = model->vocab_size * model->max_generation_size * sizeof(float); + octx->prep_output_buffer(new_size); + + ggml_cgraph * gf = build_orpheus_graph(batch); + + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(octx->sched, gf); + + set_inputs(batch); + ggml_backend_sched_graph_compute_async(octx->sched, gf); + + float * logits_out = octx->logits + octx->n_outputs * model->vocab_size; + octx->get_ggml_node_data(res, logits_out, model->vocab_size * sizeof(float)); + + // update the total number of outputs retrieved and the current position + octx->current_position += batch.n_tokens; + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation.
+ ggml_backend_sched_reset(octx->sched); +} + +void orpheus_runner::set_inputs(orpheus_ubatch & batch) { + ggml_backend_tensor_set(octx->inp_tokens, batch.tokens.data(), 0, batch.tokens.size()*ggml_element_size(octx->inp_tokens)); + int32_t * pos = (int32_t*) octx->positions->data; + float * mask = (float*) octx->attn_mask->data; + uint32_t max_pos = octx->current_position + batch.n_tokens; + for (int i = 0; i < batch.n_tokens; i++) { + pos[i] = (int32_t) octx->current_position + i; + for (int ii = 0; ii < max_pos; ii++) { + mask[i*max_pos + ii] = ii > pos[i] ? -INFINITY : 0.0f; + } + } +} + +orpheus_ubatch orpheus_runner::batch_from_sentence(std::string sentence) { + struct orpheus_ubatch batch; + for (auto t : orpheus_prepended_tokens) { + batch.tokens.push_back(t); + } + if (!octx->voice.empty()) { + sentence = octx->voice + ": " + sentence; + } + tokenizer->tokenize(sentence, batch.tokens); + for (auto t : orpheus_appended_tokens) { + batch.tokens.push_back(t); + } + batch.n_tokens = batch.tokens.size(); + return batch; +} + +std::vector> orpheus_runner::prepare_output_tokens() { + size_t chunks = octx->output_tokens.size() / 7; + std::vector> output_tokens; + for (int i = 0; i < model->audio_heads; i++) { + output_tokens.push_back(std::vector{}); + } + for (int i = 0; i < chunks; i++) { + for (int ii = 0; ii < 7; ii++) { + uint32_t thead = model->heads[ii]; + // the manipulations below are not configured because they are performed inline via undocumented constants in the Orpheus codebase. + // Essentially this is how Orpheus converts discrete samples from the output shape to the audio input shape. 
+ uint32_t t = octx->output_tokens[i*7 + ii] - 128266 - ((ii % 7) * 4096); + output_tokens[thead].push_back(t); + } + } + return output_tokens; +} + +void orpheus_runner::generate_from_batch(orpheus_ubatch & batch, struct tts_response * output) { + while ((octx->output_tokens.size() == 0 || octx->output_tokens.back() != model->stopping_token_id) && octx->output_tokens.size() < model->max_generation_size) { + decode(batch); + generation_sampler->sample(octx->logits + octx->n_outputs * model->vocab_size, octx->output_tokens); + // only increment the output count after sampling + octx->n_outputs++; + batch = orpheus_ubatch{ + 1, {octx->output_tokens.back()} + }; + } + // this case could be better addressed by adding splitting to the generation process. + if (octx->output_tokens.size() >= model->max_generation_size) { + fprintf(stdout, "Warning: generation hit its max default length. The generated audio may not contain the entire prompt.\n"); + } + std::vector> processed_output_tokens = prepare_output_tokens(); + srunner->run(processed_output_tokens, output); +} + +int orpheus_runner::generate(std::string sentence, struct tts_response * response) { + orpheus_ubatch batch = batch_from_sentence(sentence); + // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will + // surpass the default size. + if (batch.tokens.size() > model->max_context_length) { + TTS_ABORT("The prompt was too large for the default context window.
Try splitting up or shortenning the prompt."); + } + octx->reset(); + generation_sampler->reset(); + if (!kv_self) { + orpheus_kv_cache_init(); + } + generate_from_batch(batch, response); + return 0; +} + +void orpheus_runner::configure_generation(generation_configuration * config) { + generation_sampler->temperature = config->temperature; + generation_sampler->repetition_penalty = config->repetition_penalty; + generation_sampler->do_sample = config->sample; + generation_sampler->top_k = config->top_k; + generation_sampler->top_p = config->top_p; + if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) { + TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str()); + } + octx->voice = config->voice; +} + +orpheus_ubatch orpheus_runner::build_worst_case_batch() { + orpheus_ubatch batch; + batch.n_tokens = model->max_context_length; + return batch; +} + +void orpheus_runner::assign_weight(std::string name, ggml_tensor * tensor) { + if (tensor->data == NULL) { + return; + } + + if (name.size() == 0) { + // handles the top level meta tensor + return; + } + + if (name.size() > 5 && name.substr(0, 5) == "snac.") { + srunner->model->assign_weight(name.substr(5), tensor); + } else if (name.size() > 8 && name.substr(0, 8) == "orpheus.") { + model->assign_weight(name.substr(8), tensor); + } else { + fprintf(stdout, "Warning: function %s encountered an unhandled tensor named '%s'.\n", __func__, name.c_str()); + } +} + +void orpheus_runner::prepare_post_load() { + srunner->prepare_post_load(); + orpheus_kv_cache_init(); + auto batch = build_worst_case_batch(); + auto gf = build_orpheus_graph(batch); + octx->prep_schedule(gf); +} diff --git a/src/orpheus_model.h b/src/orpheus_model.h new file mode 100644 index 0000000..6edd36b --- /dev/null +++ b/src/orpheus_model.h @@ -0,0 +1,145 @@ +#pragma once + +#include "sampler.h" +#include "tokenizer.h" +#include "snac_model.h" + +// Orpheus 
uses vLLM with a llama-3 architecture. The only critical difference from the normal llama architecture is the use of kv heads. + +struct orpheus_layer { + struct ggml_tensor * input_norm; + struct ggml_tensor * post_attention_norm; + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + struct ggml_tensor * o; + struct ggml_tensor * gate; + struct ggml_tensor * up; + struct ggml_tensor * down; +}; + +struct orpheus_model : tts_model { + uint32_t vocab_size = 156940; + uint32_t n_attn_heads = 24; + uint32_t n_kv_attn_heads = 8; + uint32_t head_size = 128; + uint32_t max_context_length = 1024; + // the generation size is technically arbitrary as the model can handle a large context. This size comes out to being 25.6 seconds. + uint32_t max_generation_size = 2100; + uint32_t stopping_token_id = 128258; + uint32_t eos_token_id = 128001; + uint32_t bos_token_id = 128000; + uint32_t hidden_size = 3072; + uint32_t kv_hidden_size = 1024; + uint32_t audio_heads = 3; + uint32_t heads[7] = {0, 1, 2, 2, 1, 2, 2}; + + int n_layers = 28; + + struct std::vector layers; + struct ggml_tensor * head; + struct ggml_tensor * embd; + struct ggml_tensor * output_norm; + struct ggml_tensor * rope_frequencies; + + void assign_weight(std::string name, ggml_tensor * tensor); + void assign_to_layer(std::string part, orpheus_layer & layer, struct ggml_tensor * tensor); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_constants(meta_ctx); + prep_layers(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "orpheus", 1.30); + } +}; + +struct orpheus_context : runner_context { + orpheus_context(orpheus_model * model, int n_threads): runner_context(n_threads), model(model) {}; + struct orpheus_model * model; + + uint32_t current_position = 0; // current position in the active sequence + uint32_t n_outputs = 0; // the 
position of the text prompt termination (used for adjusting the cache when incrementally generating) + std::string voice; + + std::vector output_tokens; + + void reset(); + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * attn_mask; + struct ggml_tensor * positions; +}; + +struct orpheus_kv_cache { + ggml_type cache_type = GGML_TYPE_F32; + + std::vector k_l; + std::vector v_l; + + struct ggml_context * ctx; + ggml_backend_buffer_type_t buft; + ggml_backend_buffer_t buf; + + void free() { + ggml_free(ctx); + ggml_backend_buffer_free(buf); + } + + ~orpheus_kv_cache() { + free(); + } +}; + +struct orpheus_context * build_new_orpheus_context(struct orpheus_model * model, int n_threads, bool use_cpu = true); + +struct orpheus_ubatch { + orpheus_ubatch() = default; + orpheus_ubatch(size_t n_tokens, std::vector tokens): n_tokens(n_tokens), tokens(tokens) {}; + size_t n_tokens; // total sentence tokens + std::vector tokens; // [n_tokens] +}; + +struct orpheus_runner : tts_runner { + orpheus_runner( + orpheus_model * model, + snac_runner * audio_decoder, + orpheus_context * octx, + bpe_tokenizer * bt, + sampler * samp, + orpheus_kv_cache * cache): model(model), srunner(audio_decoder), octx(octx), tokenizer(bt), generation_sampler(samp), kv_self(cache) { + tts_runner::sampling_rate = 24000.0f; + generation_sampler->n_output_heads = 1; + generation_sampler->vocab_size = model->vocab_size; + generation_sampler->eos_token_id = model->eos_token_id; + } + orpheus_model * model; + snac_runner * srunner; + orpheus_context * octx; + bpe_tokenizer * tokenizer; + orpheus_kv_cache * kv_self; + sampler * generation_sampler; + + void init_build() { + tts_runner::init_build(&octx->buf_compute_meta); + } + + struct ggml_cgraph * build_orpheus_graph(orpheus_ubatch & batch); + void orpheus_kv_cache_init(); + void orpheus_build_kv_store(struct ggml_context * ctx, struct ggml_cgraph * graph, struct 
ggml_tensor * k_cur, struct ggml_tensor * v_cur, int index, uint32_t n_tokens, int repeat); + void configure_generation(generation_configuration * config); + void assign_weight(std::string name, ggml_tensor * tensor); + std::vector> prepare_output_tokens(); + orpheus_ubatch build_worst_case_batch(); + orpheus_ubatch batch_from_sentence(std::string sentence); + void set_inputs(orpheus_ubatch & batch); + void decode(orpheus_ubatch & batch); + void prepare_post_load(); + int generate(std::string sentence, struct tts_response * response); + void generate_from_batch(orpheus_ubatch & batch, struct tts_response * output); +}; + +static struct ggml_tensor * orpheus_build_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight); +static struct ggml_tensor * build_attn_mask(ggml_context * ctx, orpheus_context * octx, orpheus_ubatch & batch); diff --git a/src/parler_model.h b/src/parler_model.h index b200999..463910f 100644 --- a/src/parler_model.h +++ b/src/parler_model.h @@ -115,9 +115,6 @@ struct parler_context : runner_context { int32_t seq_id; // a unique identifier associated with the active sequence. 
std::vector output_tokens; - - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; struct ggml_tensor * inp_tokens; struct ggml_tensor * audio_inp_tokens; @@ -207,7 +204,6 @@ struct parler_tts_runner : tts_runner { tts_runner::init_build(&pctx->buf_compute_meta); } - void configure_generation(generation_configuration * config); void assign_weight(std::string name, ggml_tensor * tensor); parler_ubatch build_worst_case_batch(); diff --git a/src/snac_model.cpp b/src/snac_model.cpp new file mode 100644 index 0000000..cfe38b3 --- /dev/null +++ b/src/snac_model.cpp @@ -0,0 +1,209 @@ +#include "snac_model.h" + +void snac_model::prep_constants(gguf_context * meta) { + int heads_key = gguf_find_key(meta, "snac.audio_token_channels"); + if (heads_key != -1) { + n_heads = gguf_get_val_u32(meta, heads_key); + } + + int sampling_factor_key = gguf_find_key(meta, "snac.up_sampling_factor"); + if (sampling_factor_key != -1) { + up_sampling_factor = gguf_get_val_u32(meta, sampling_factor_key); + } + + int max_gen_key = gguf_find_key(meta, "snac.max_generation_size"); + if (max_gen_key != -1) { + max_generation_size = gguf_get_val_u32(meta, max_gen_key); + } +} + +void snac_model::prep_layers(gguf_context * meta) { + for (int i = 0; i < n_heads; i++) { + quantizer_layers.push_back(general_neural_audio_codec::residual_vector_quantize_layer{}); + } + + for (int i = 0; i < n_layers; i++) { + std::string stride_key = "snac.snac_layer_stride_" + std::to_string(i); + std::string padding_key = "snac.snac_layer_padding_" + std::to_string(i); + std::string grouping_key = "snac.snac_layer_grouping_" + std::to_string(i); + int layer_stride_key = gguf_find_key(meta, stride_key.c_str()); + if (layer_stride_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", stride_key.c_str()); + } + int layer_padding_key = gguf_find_key(meta, padding_key.c_str()); + if (layer_padding_key == -1) { + TTS_ABORT("key %s 
must be specified in gguf file inorder to initialize the SNAC audio decoder.", padding_key.c_str()); + } + int layer_grouping_key = gguf_find_key(meta, grouping_key.c_str()); + if (layer_grouping_key == -1) { + TTS_ABORT("key %s must be specified in gguf file inorder to initialize the SNAC audio decoder.", grouping_key.c_str()); + } + layers.push_back( + general_neural_audio_codec::layer{ + gguf_get_val_u32(meta, layer_padding_key), + gguf_get_val_u32(meta, layer_stride_key), + gguf_get_val_u32(meta, layer_grouping_key) + } + ); + } +} + +void snac_model::assign_weight(std::string name, ggml_tensor * tensor) { + if (name == "alpha_out") { + snake_alpha = ggml_dup_tensor(ctx, tensor); + set_tensor(snake_alpha, tensor); + } else if (name == "in.weight") { + in_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(in_conv_kernel, tensor); + } else if (name == "in.bias") { + in_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(in_conv_bias, tensor); + } else if (name == "up.weight") { + up_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(up_conv_kernel, tensor); + } else if (name == "up.bias") { + up_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(up_conv_bias, tensor); + } else if (name == "final.weight") { + out_conv_kernel = ggml_dup_tensor(ctx, tensor); + set_tensor(out_conv_kernel, tensor); + } else if (name == "final.bias") { + out_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor)); + set_tensor(out_conv_bias, tensor); + } else if (has_prefix(name, "layers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_layer((tts_model *) this, layers[l], lt_name, tensor); + } else if (has_prefix(name, "quantizers")) { + auto pair = parse_layer_count(name); + int l = pair.first; + std::string lt_name = pair.second; + general_neural_audio_codec::assign_to_quantize_layer((tts_model *) this, quantizer_layers[l], 
lt_name, tensor); + } +} + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers) { + struct ggml_tensor * embd; + // these divisors represent the discrete repeats performed against each of the three input heads. + sctx->inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sequence_length / 4 + sequence_length / 2 + sequence_length); + ggml_set_input(sctx->inp_tokens); + size_t last_stride = 0; + for(int i = 0; i < sctx->model->n_heads; i++) { + auto quantize_layer = sctx->model->quantizer_layers[i]; + struct ggml_tensor * inp_head = ggml_cont(ctx, ggml_view_1d(ctx, sctx->inp_tokens, sequence_length / sctx->model->repeats[i], last_stride)); + last_stride += (sequence_length / sctx->model->repeats[i]) * ggml_element_size(sctx->inp_tokens); + struct ggml_tensor * code = general_neural_audio_codec::build_quantize_layer(ctx, inp_head, quantize_layer); + if (sctx->model->repeats[i] > 1) { + // this manipulation is equivalent to repeat_interleave against the first dimension of the tensor + code = ggml_repeat(ctx, ggml_cont_3d(ctx, code, 1, code->ne[0], code->ne[1]), ggml_new_tensor_3d(ctx, GGML_TYPE_F32, sctx->model->repeats[i], code->ne[0], sctx->model->embd)); + code = ggml_cont_2d(ctx, code, sequence_length, code->ne[2]); + } + if (i == 0) { + embd = code; + } else { + embd = ggml_add(ctx, embd, code); + } + } + return embd; +} + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu) { + snac_context * sctx = new snac_context(model, n_threads); + if (!use_cpu) { +#ifdef GGML_USE_METAL + sctx->backend = ggml_backend_metal_init(); +#endif + } + sctx->backend_cpu = ggml_backend_cpu_init(); + sctx->set_threads(); + sctx->build_schedule(); + sctx->buf_compute_meta.resize(ggml_tensor_overhead()*model->max_nodes() + ggml_graph_overhead_custom(model->max_nodes(), false)); + return sctx; +} + +void snac_runner::prepare_post_load() { + 
ggml_cgraph * gf = build_snac_graph(model->max_generation_size); + sctx->prep_schedule(gf); +} + +struct ggml_cgraph * snac_runner::build_snac_graph(size_t sequence_length) { + init_build(); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + struct ggml_tensor * cur; + struct ggml_tensor * inputs; + + sctx->noise = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model->noise_steps_sum * sequence_length); + ggml_set_input(sctx->noise); + + inputs = snac_build_audio_inputs(ctx, sctx, sequence_length, model->quantizer_layers); + cur = ggml_conv_1d_dw(ctx, model->in_conv_kernel, inputs, 1, 3, 1); + cur = ggml_add(ctx, cur, model->in_conv_bias); + cur = ggml_conv_1d(ctx, model->up_conv_kernel, cur, 1, 0, 1); + cur = ggml_add(ctx, cur, model->up_conv_bias); + size_t noise_offset = 0; + for (int l = 0; l < model->layers.size(); l++) { + auto layer = model->layers[l]; + struct ggml_tensor * noise = ggml_cont(ctx, ggml_view_1d(ctx, sctx->noise, model->noise_steps[l] * sequence_length, noise_offset)); + noise_offset += model->noise_steps[l] * sequence_length * sizeof(float); + cur = general_neural_audio_codec::build_layer(ctx, cur, layer, noise); + } + cur = snake_1d(ctx, model->snake_alpha, cur); + cur = ggml_conv_1d(ctx, model->out_conv_kernel, cur, 1, 3, 1); + cur = ggml_add(ctx, cur, model->out_conv_bias); + cur = ggml_tanh(ctx, cur); + ggml_build_forward_expand(gf, cur); + free_build(); + return gf; +} + +void snac_runner::set_inputs(std::vector> & tokens) { + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[0].data(), 0, + tokens[0].size()*ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[1].data(), tokens[0].size() * ggml_element_size(sctx->inp_tokens), + tokens[1].size() * ggml_element_size(sctx->inp_tokens) + ); + + ggml_backend_tensor_set( + sctx->inp_tokens, tokens[2].data(), + tokens[1].size()*ggml_element_size(sctx->inp_tokens)+tokens[0].size()*ggml_element_size(sctx->inp_tokens), + 
tokens[2].size()*ggml_element_size(sctx->inp_tokens) + ); + size_t sequence_length = tokens[2].size(); + random_normal_gen(model->noise_steps_sum * sequence_length, (float*) sctx->noise->data); +} + +void snac_runner::run(std::vector> & tokens, struct tts_response * outputs) { + size_t sequence_length = tokens[2].size(); + ggml_backend_sched_reset(sctx->sched); + + sctx->prep_output_buffer(model->max_generation_size * model->up_sampling_factor * sizeof(float)); + + outputs->data = sctx->logits; + ggml_backend_buffer_clear(sctx->buf_output, 0); + + struct ggml_cgraph * gf = NULL; + gf = build_snac_graph(sequence_length); + + // the output is always the last tensor in the graph + struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1]; + ggml_backend_sched_alloc_graph(sctx->sched, gf); + + set_inputs(tokens); + + ggml_backend_sched_graph_compute_async(sctx->sched, gf); + + sctx->get_ggml_node_data(result, outputs->data, sequence_length*sizeof(float)*model->up_sampling_factor); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sctx->sched); + outputs->n_outputs = sequence_length * model->up_sampling_factor; + return; +} + diff --git a/src/snac_model.h b/src/snac_model.h new file mode 100644 index 0000000..9450c1b --- /dev/null +++ b/src/snac_model.h @@ -0,0 +1,86 @@ +#pragma once + +#include "general_neural_audio_codec.h" + +// SNAC, Scale Neural Audio Codec, is another neural audio codec much like DAC. +// The key differences are that it uses grouping in the residual units of its layers, +// performs a repeat_interleave over the second and third input channels, applies +// a noise convolutional layer after input encoding for each layer, and applies +// an extra convolutional layer before residual layers are applied. 
+struct snac_model : tts_model { + // general configuration from SNAC as used by Orpheus + uint32_t n_layers = 4; + uint32_t n_heads = 3; + uint32_t up_sampling_factor = 512; + uint32_t embd = 768; + size_t max_generation_size = 2580; + uint32_t repeats[3] = {4, 2, 1}; + // configuration for adding noise + uint32_t noise_steps[4] = {8, 64, 256, 512}; + uint32_t noise_steps_sum = 840; + bool use_noise = true; + + struct ggml_tensor * repeat_interleave_buffer; + + struct ggml_tensor * in_conv_kernel; + struct ggml_tensor * in_conv_bias; + struct ggml_tensor * up_conv_kernel; + struct ggml_tensor * up_conv_bias; + struct ggml_tensor * out_conv_kernel; + struct ggml_tensor * out_conv_bias; + struct ggml_tensor * snake_alpha; + std::vector layers; + std::vector quantizer_layers; + + void assign_weight(std::string name, ggml_tensor * weight); + void prep_constants(gguf_context * meta); + void prep_layers(gguf_context * meta); + void post_load_assign(); + void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only) { + prep_layers(meta_ctx); + prep_constants(meta_ctx); + tts_model::setup_from_file(meta_ctx, load_context, cpu_only, "snac"); + } +}; + +// the context used for running the snac model +struct snac_context : runner_context { + snac_context(snac_model * model, int n_threads): runner_context(n_threads), model(model) {}; + + struct snac_model * model; + + struct ggml_tensor * inp_tokens; + struct ggml_tensor * noise; + + void build_schedule() { + runner_context::build_schedule(model->max_nodes()); + } +}; + +snac_context * build_new_snac_context(struct snac_model * model, int n_threads, bool use_cpu = true); + +static struct ggml_tensor * snac_build_audio_inputs(struct ggml_context * ctx, struct snac_context * sctx, size_t sequence_length, std::vector layers); + +// This struct is intended to manage the snac model's graph compilation and compute function. 
+struct snac_runner : tts_runner { + snac_runner(snac_model * model, snac_context * context): model(model), sctx(context) {}; + ~snac_runner() { + if (ctx) { + ggml_free(ctx); + } + model->free(); + delete model; + delete sctx; + } + snac_model * model; + snac_context * sctx; + + void init_build() { + tts_runner::init_build(&sctx->buf_compute_meta); + } + + void set_inputs(std::vector> & tokens); + void prepare_post_load(); + struct ggml_cgraph * build_snac_graph(size_t sequence_length); + void run(std::vector> & tokens, struct tts_response * outputs); +}; diff --git a/src/t5_encoder_model.h b/src/t5_encoder_model.h index c155ec9..9a80187 100644 --- a/src/t5_encoder_model.h +++ b/src/t5_encoder_model.h @@ -78,9 +78,6 @@ struct t5_context : runner_context { struct t5_encoder * model; - size_t logits_size = 0; // capacity (of floats) for logits - float * logits = nullptr; - struct ggml_tensor * inp_tokens; struct ggml_tensor * positions; struct ggml_tensor * attn_mask; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index b07ebd2..5663613 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -206,3 +206,126 @@ struct single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * me return new single_pass_tokenizer(tokens); } +void bpe_symbol::add_merges(std::priority_queue, bpe_merge_comp> & merges, std::unordered_map, int, pair_hash> & rank_map, bool only_forward) { + if (!only_forward && last) { + auto rid = std::make_pair(last->as_str(), as_str()); + if (rank_map.find(rid) != rank_map.end()) { + bpe_merge m{last, this, rank_map[rid], last->size + size}; + merges.push(m); + } + } + + if (next) { + auto rid = std::make_pair(as_str(), next->as_str()); + if (rank_map.find(rid) != rank_map.end()) { + bpe_merge m{this, next, rank_map[rid], size + next->size}; + merges.push(m); + } + } +} + +std::string bpe_symbol::as_str() { + return std::string(token, size); +} + +bool bpe_merge_comp::operator() (const bpe_merge & a, const bpe_merge & b) { + return 
a.rank > b.rank || (a.rank == b.rank && a.a && b.a && a.a->pos > b.a->pos); +} + +size_t pair_hash::operator() (const std::pair & p) const { + return std::hash{}(p.first) ^ (std::hash{}(p.second) << 1); +} + +bpe_symbol * bpe_merge::merge() { + a->size += b->size; + b->size = -1; + a->next = b->next; + if (a->next) { + a->next->last = a; + } + return a; +} + +void pair_builder::join_pairs(std::unordered_map, int, pair_hash> & rank_map) { + std::priority_queue, bpe_merge_comp> merges; + for (auto part : parts) { + part->add_merges(merges, rank_map, true); + } + while (!merges.empty()) { + auto m = merges.top(); + merges.pop(); + if (m.a->size > 0 && m.b->size > 0 && m.new_size == m.a->size + m.b->size) { + m.merge(); + m.a->add_merges(merges, rank_map); + } + + } +} + +void bpe_tokenizer::tokenize(const std::string & text, std::vector & token_ids) { + std::vector chunks = split(text, " ", true); + bool space_prior = false; + for (auto chunk : chunks) { + if (chunk != " ") { + bpe_tokenize(space_prior ? 
"Ġ" + chunk : chunk, token_ids); + } else { + space_prior = true; + } + } +} + +void bpe_tokenizer::bpe_tokenize(std::string chunk, std::vector & token_ids) { + if (tokens_to_ids.find(chunk) != tokens_to_ids.end()) { + token_ids.push_back(tokens_to_ids[chunk]); + return; + } + auto pb = pair_builder{chunk}; + pb.join_pairs(ranks); + bpe_symbol * next = pb.parts[0]; + while (next) { + token_ids.push_back(tokens_to_ids[next->as_str()]); + next = next->next; + } +} + +bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name) { + int vocab_key = gguf_find_key(meta, (base_name + ".tokens").c_str()); + if (vocab_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".tokens").c_str()); + } + int merges_key = gguf_find_key(meta, (base_name + ".merges").c_str()); + if (merges_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".merges").c_str()); + } + int eos_token_id_key = gguf_find_key(meta, (base_name + ".eos_token_id").c_str()); + if (eos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".eos_token_id").c_str()); + } + int bos_token_id_key = gguf_find_key(meta, (base_name + ".bos_token_id").c_str()); + if (bos_token_id_key == -1) { + TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".bos_token_id").c_str()); + } + + uint32_t bos_token_id = gguf_get_val_u32(meta, bos_token_id_key); + uint32_t eos_token_id = gguf_get_val_u32(meta, eos_token_id_key); + + std::unordered_map vocab; + int token_count = gguf_get_arr_n(meta, vocab_key); + for (int i = 0; i < token_count; i++) { + vocab[gguf_get_arr_str(meta, vocab_key, i)] = (uint32_t) i; + } + + std::unordered_map, int, pair_hash> ranks; + int merge_count = gguf_get_arr_n(meta, merges_key); + + for (int i = 0; i < merge_count; i++) { + auto raw_merge = gguf_get_arr_str(meta, merges_key, i); + 
std::vector pair = split(raw_merge, " "); + if (pair.size() != 2) { + TTS_ABORT("Invalid pair, '%s', found in BPE merges, '%s', at index %d.", raw_merge, (base_name + ".merges").c_str(), i); + } + ranks[std::make_pair<>(pair[0], pair[1])] = i; + } + + return new bpe_tokenizer(vocab, ranks, bos_token_id, eos_token_id); +} diff --git a/src/tokenizer.h b/src/tokenizer.h index 964d6f9..6216340 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "util.h" struct token_trie { @@ -74,4 +75,80 @@ struct single_pass_tokenizer { single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name = "phonemizer.graphemes"); +struct bpe_symbol; + +struct bpe_merge { + bpe_symbol * a; + bpe_symbol * b; + int rank; + int new_size; + + bpe_symbol * merge(); +}; + +struct bpe_merge_comp{ + bool operator() (const bpe_merge & a, const bpe_merge & b); +}; + +struct pair_hash { + size_t operator() (const std::pair & p) const; +}; + +struct bpe_symbol { + bpe_symbol(const char * token): token(token) {}; + const char* token; + int size = 1; + int pos; + bpe_symbol * next = nullptr; + bpe_symbol * last = nullptr; + + void add_merges(std::priority_queue, bpe_merge_comp> & merges, std::unordered_map, int, pair_hash> & rank_map, bool only_forward = false); + std::string as_str(); +}; + +struct pair_builder { + pair_builder(std::string word) { + bpe_symbol * last = nullptr; + for (int i = 0; i < word.size(); i++) { + int increment = 0; + // make sure we process each utf-8 character. 
+ while(i + increment + 1 < word.size() && (word[i+increment+1] & 0b11000000) == 0b10000000) { + ++increment; + } + bpe_symbol * part = new bpe_symbol(word.data()+i); + part->pos = i; + part->size += increment; + i += increment; + if (last) { + last->next = part; + part->last = last; + } + last = part; + parts.push_back(part); + } + } + + ~pair_builder() { + for (auto p : parts) { + delete p; + } + } + + void join_pairs(std::unordered_map, int, pair_hash> & rank_map); + std::vector parts; +}; + +struct bpe_tokenizer { + bpe_tokenizer(std::unordered_map & tokens_to_ids, std::unordered_map, int, pair_hash> & ranks, uint32_t bos, uint32_t eos): tokens_to_ids(tokens_to_ids), ranks(ranks), eos_token_id(eos), bos_token_id(bos) {}; + std::unordered_map tokens_to_ids; + std::unordered_map, int, pair_hash> ranks; + uint32_t eos_token_id; + uint32_t bos_token_id; + + void tokenize(const std::string & text, std::vector & token_ids); + void bpe_tokenize(std::string chunk, std::vector & token_ids); +}; + +bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name = "tokenizer.ggml"); + #endif diff --git a/src/tts.cpp b/src/tts.cpp index d426dae..348144e 100644 --- a/src/tts.cpp +++ b/src/tts.cpp @@ -10,6 +10,32 @@ static constexpr std::array DURATION_PREDICTOR_QUANTIZATION_COM "layers" }; +struct tts_runner * orpheus_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { + orpheus_model * model = new orpheus_model; + snac_model * audio_model = new snac_model; + bpe_tokenizer * bt = bpe_tokenizer_from_gguf(meta_ctx); + model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + audio_model->setup_from_file(meta_ctx, weight_ctx, cpu_only); + sampler * samp = new sampler; + snac_context * sctx = build_new_snac_context(audio_model, n_threads, cpu_only); + snac_runner * audio_decoder = new snac_runner(audio_model, sctx); + orpheus_context * octx = 
build_new_orpheus_context(model, n_threads, cpu_only); + orpheus_kv_cache * cache = new orpheus_kv_cache; + orpheus_runner * runner = new orpheus_runner(model, audio_decoder, octx, bt, samp, cache); + + for (ggml_tensor * cur = ggml_get_first_tensor(weight_ctx); cur; cur = ggml_get_next_tensor(weight_ctx, cur)) { + runner->assign_weight(cur->name, cur); + } + + runner->prepare_post_load(); + + gguf_free(meta_ctx); + ggml_free(weight_ctx); + runner->arch = arch; + + return (tts_runner*)runner; +} + struct tts_runner * parler_tts_from_file(gguf_context * meta_ctx, ggml_context * weight_ctx, int n_threads, generation_configuration * config, tts_arch arch, bool cpu_only) { parler_tts_model * model = new parler_tts_model; dac_model * audio_model = new dac_model; @@ -125,6 +151,8 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g return kokoro_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); case DIA_ARCH: return dia_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); + case ORPHEUS_ARCH: + return orpheus_from_file(meta_ctx, weight_ctx, n_threads, config, arch_type, cpu_only); default: TTS_ABORT("%s failed for file %s. The architecture '%s' is not supported.", __func__, fname.c_str(), arch.c_str()); } @@ -140,6 +168,9 @@ int generate(tts_runner * runner, std::string sentence, struct tts_response * re case DIA_ARCH: ((dia_runner*)runner)->configure_generation(config); return ((dia_runner*)runner)->generate(sentence, response); + case ORPHEUS_ARCH: + ((orpheus_runner*)runner)->configure_generation(config); + return ((orpheus_runner*)runner)->generate(sentence, response); default: TTS_ABORT("%s failed. 
The architecture '%d' is not supported.", __func__, runner->arch); } diff --git a/src/tts_model.cpp b/src/tts_model.cpp index b6cad74..8fb8412 100644 --- a/src/tts_model.cpp +++ b/src/tts_model.cpp @@ -67,6 +67,19 @@ bool runner_context::prep_schedule(struct ggml_cgraph * gf) { return ggml_backend_sched_reserve(sched, gf); } +void runner_context::prep_output_buffer(size_t new_size) { + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output) : 0; + if (!buf_output || prev_size < new_size) { + if (buf_output) { + ggml_backend_buffer_free(buf_output); + buf_output = nullptr; + logits = nullptr; + } + buf_output = ggml_backend_buft_alloc_buffer(backend_cpu_buffer, new_size); + } + logits = (float *) ggml_backend_buffer_get_base(buf_output); +} + void tts_runner::init_build(std::vector* buf_compute_meta) { struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta->size(), diff --git a/src/tts_model.h b/src/tts_model.h index 6eb59de..93d0a21 100644 --- a/src/tts_model.h +++ b/src/tts_model.h @@ -11,7 +11,7 @@ void append_to_response(struct tts_response * response, struct tts_response * to using tensor_meta_callback = std::function*; struct runner_context { - runner_context(int n_threads): n_threads(n_threads) {}; + runner_context(int n_threads): n_threads(n_threads) {}; virtual ~runner_context() { ggml_backend_sched_free(sched); ggml_threadpool_free(threadpool); @@ -30,16 +30,18 @@ struct runner_context { ggml_backend_buffer_t buf_output = nullptr; ggml_backend_sched_t sched = nullptr; ggml_threadpool_t threadpool = nullptr; + float * logits = nullptr; int n_threads; void get_ggml_node_data(struct ggml_tensor * output_tensor, float * output, size_t output_size, ggml_backend_buffer_t buffer = nullptr); void set_threads(); void build_schedule(size_t max_nodes); bool prep_schedule(ggml_cgraph * gf); + void prep_output_buffer(size_t new_size); }; struct tts_model { - struct model_tensor_meta tensor_meta; + struct model_tensor_meta 
tensor_meta; // this is the current byte offset into the model's buffer. size_t offset = 0; @@ -56,7 +58,7 @@ struct tts_model { struct ggml_context * ctx; - void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size); + void prep_buffers_and_context(bool cpu_only, float size_offset, uint32_t dedicated_add_on_size); void setup_from_file(gguf_context * meta_ctx, ggml_context * load_context, bool cpu_only, std::string model_prefix, float size_offset = 1.4, uint32_t dedicated_add_on_size = 0); void set_tensor(struct ggml_tensor * tensor, struct ggml_tensor * target); size_t max_nodes(); diff --git a/src/util.cpp b/src/util.cpp index a5bbb4b..9068c70 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -63,7 +63,7 @@ int search_for_gguf_keys(gguf_context * meta, std::vector possible_ return gguf_key; } -void random_gen(int count, float * tgt, float min, float max) { +void random_uniform_gen(int count, float * tgt, float min, float max) { static std::default_random_engine e; static std::uniform_real_distribution dis(min, max); for (int i = 0; i < count; i++) { @@ -71,6 +71,14 @@ void random_gen(int count, float * tgt, float min, float max) { } } +void random_normal_gen(int count, float * tgt, float mean, float std) { + static std::default_random_engine e; + static std::normal_distribution dis(mean, std); + for (int i = 0; i < count; i++) { + tgt[i] = dis(e); + } +} + float round_to_float(double v) { return roundf(v * powl(10, 6)) / powl(10, 6); } @@ -220,6 +228,11 @@ std::vector split(std::string target, std::string split_on, bool in output.push_back(target.substr(i, 1)); } last = i+1; + } else if (i == last && split_on.find(target[i]) != std::string::npos) { + if (include_split_characters) { + output.push_back(target.substr(i, 1)); + } + last = i+1; } } if (last < target.size()) { @@ -242,6 +255,11 @@ std::vector split(std::string target, const char split_on, bool inc output.push_back(target.substr(i, 1)); } last = i+1; + } else if (i == 
last && split_on == target[i]) { + if (include_split_characters) { + output.push_back(target.substr(i, 1)); + } + last = i+1; } } if (last < target.size()) { diff --git a/src/util.h b/src/util.h index 458d080..681111e 100644 --- a/src/util.h +++ b/src/util.h @@ -26,7 +26,12 @@ struct model_tensor_meta { size_t n_bytes = 0; }; -void random_gen(int count, float * tgt, float min = 0.0f, float max = 1.0); +/** + * Both of these random fill the tgt array with count random floating point values. + * the default parameter values are consistent with pytorch random function defaults. + */ +void random_uniform_gen(int count, float * tgt, float min = 0.0f, float max = 1.0f); +void random_normal_gen(int count, float * tgt, float mean = 0.0f, float std = 1.0f); std::pair parse_layer_count(std::string name, int skip = 0);