diff --git a/.github/workflows/crowdin.yml b/.github/workflows/crowdin.yml index 64d80fa56..d6b78e3f7 100644 --- a/.github/workflows/crowdin.yml +++ b/.github/workflows/crowdin.yml @@ -1,4 +1,4 @@ -name: Crowdin Action +name: THAI CVVC-VCCV on: workflow_dispatch: {} diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb new file mode 100644 index 000000000..f478fbc14 --- /dev/null +++ b/DiffSinger_colab_notebook.ipynb @@ -0,0 +1,1945 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "MP5rRkbTpnG8", + "Wv0gfI5feBSc", + "eexZl_OCDmQ3", + "0J3b18EKdzMC", + "FY40fGHEg9_i", + "4sbU1aH5kGFE" + ], + "gpuType": "T4", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MP5rRkbTpnG8" + }, + "source": [ + "# _**[DiffSinger](https://github.com/openvpi/DiffSinger)**_\n", + "_Singing Voice Synthesis via Shallow Diffusion Mechanism (SVS & TTS)_\n", + "\n", + "\\\n", + "____\n", + "\n", + "Note:\n", + "- This notebook will get update semi-frequently based from the feedback or response from users\n", + "- Make sure to compare your file structure to the [data example](https://github.com/usamireko/DiffSinger_colab_notebook_MLo7/blob/main/data_example.md)\n", + "\n", + "```We refer \"variance\" as \"parameters\" to avoid the confusion```\n", + "\n", + "```Use export_mode if only wanting to export your ONNX files and nothing more```\n", + "\n", + "\\\n", + "____\n", + "\\\n", + "#### **This notebook is an edited copy of Kei's Diffsinger [colab notebook](https://colab.research.google.com/drive/1kUg9dz8PPH92NfnLZwgq0_9B9an39t1J?usp=sharing)**\n", + "####**This 
notebook is maintained by MLo7**\n", + "\n", + "___" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Setup**" + ], + "metadata": { + "id": "Wv0gfI5feBSc" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "pK8aicf8A2sj", + "cellView": "form", + "collapsed": true, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 630 + }, + "outputId": "bcc59e57-72c6-4170-a950-e505e46eef54" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "setup complete!\n", + "|\n", + "|\n", + "|\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "source": [ + "from IPython.display import clear_output, Audio, display, HTML\n", + "import os\n", + "from google.colab import drive\n", + "\n", + "def setup_onnx_export():\n", + " print(\"ONNX Export Mode Enabled, Installing required components\")\n", + " !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger\n", + " !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh\n", + " !chmod +x /content/mini.sh\n", + " !bash /content/mini.sh -b -f -p /usr/local\n", + " !conda install -q -y jupyter\n", + " !conda install -q -y google-colab -c conda-forge\n", + " !python -m ipykernel install --name \"py310\" --user\n", + " print(\"installing dependencies for ONNX conversion\")\n", + " !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null\n", + " print(\"Installation complete, time to export those ONNX!\")\n", + "\n", + "def setup_standard():\n", + " if not os.path.exists(\"/content/pretrain_models\"):\n", + " os.makedirs(\"/content/pretrain_models\")\n", + "\n", + " !wget 
https://github.com/MLo7Ghinsan/DiffSinger_colab_notebook_MLo7/releases/download/OU_files/jpn_dict.txt -O /content/jpn_dict.txt\n", + " !rm -rf /content/sample_data\n", + " !apt-get install aria2\n", + " clear_output()\n", + " !git clone https://github.com/UtaUtaUtau/nnsvs-db-converter /content/nnsvs-db-converter\n", + " !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger\n", + " !git clone https://github.com/openvpi/MakeDiffSinger /content/MakeDiffSinger\n", + " !git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans\n", + " !git clone https://github.com/openvpi/SOME /content/SOME\n", + " clear_output()\n", + " !pip install torch torchvision torchaudio\n", + " clear_output()\n", + " !pip install -r /content/DiffSinger/requirements.txt\n", + " !pip install -r /content/SOME/requirements.txt\n", + " !pip install mido einops\n", + " clear_output()\n", + " !wget https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-44.1k-hop512-128bin-2024.02/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -O /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip\n", + " !wget https://github.com/openvpi/vocoders/releases/download/pc-nsf-hifigan-44.1k-hop512-128bin-2025.02/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -O /content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip\n", + " !wget https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip -O /content/rmvpe.zip\n", + " !wget https://github.com/openvpi/SOME/releases/download/v1.0.0-baseline/0119_continuous128_5spk.zip -O /content/0119_continuous128_5spk.zip\n", + " !wget https://github.com/yxlllc/vocal-remover/releases/download/hnsep_240512/hnsep_240512.zip -O /content/DiffSinger/checkpoints/hnsep_240512.zip\n", + " !unzip -q /content/DiffSinger/checkpoints/hnsep_240512.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/0119_continuous128_5spk.zip -d /content/DiffSinger/checkpoints/SOME\n", + " !unzip -q 
/content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/rmvpe.zip -d /content/MakeDiffSinger/variance-temp-solution/assets\n", + " !rm /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip\n", + " !rm /content/rmvpe.zip\n", + " !rm /content/0119_continuous128_5spk.zip\n", + " !aria2c -d /content/pretrain_models -o acoustic_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/acoustic/model_ckpt_steps_49000.ckpt\n", + " !aria2c -d /content/pretrain_models -o variance_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/variance/model_ckpt_steps_51000.ckpt\n", + " clear_output()\n", + " !pip install --upgrade tensorboard\n", + " clear_output()\n", + " !pip install protobuf\n", + " clear_output()\n", + " !pip install onnxruntime\n", + " clear_output()\n", + " !pip install pydub\n", + " clear_output()\n", + "\n", + "#@title # Mount Google Drive and Setup\n", + "export_mode = False # @param {\"type\":\"boolean\"}\n", + "drive.mount(\"/content/drive\")\n", + "\n", + "if export_mode:\n", + " setup_onnx_export()\n", + "else:\n", + " setup_standard()\n", + "\n", + "clear_output()\n", + "print(\"setup complete!\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "!git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans 2>/dev/null\n", + "chika_dance = ''\n", + "display(HTML(chika_dance))\n", + "with open(\"/content/ghin_shenanigans/audio/setup_complete.wav\", \"rb\") as f:\n", + " setup_complete_sound = f.read()\n", + "Audio(data=setup_complete_sound, autoplay=True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Preprocess data for training**" + ], + "metadata": { + "id": "eexZl_OCDmQ3" + } + }, + { + "cell_type": "code", + "source": [ 
+ "#@title #Extract Data\n", + "#@markdown ___\n", + "%cd /content\n", + "#@markdown this cell will create a folder name [raw_data] in the root folder of colab (/content) and extract your data into it\n", + "\n", + "data_type = \"ds (DiffSinger format)\" # @param [\"lab + wav (NNSVS format)\", \"csv + wav (DiffSinger format)\", \"ds (DiffSinger format)\"]\n", + "\n", + "#@markdown The path to your data zip file\n", + "\n", + "data_zip_path = \"/content/drive/MyDrive/dataset/Kochujang.zip\" #@param {type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown nnsvs-db-converter settings (lab + wav ONLY)\n", + "\n", + "#@markdown _These values can exceed the amount that's in your data to maximize the segment length or to keep the data as is_\n", + "\n", + "#@markdown This option is necessary for variance's pitch training\n", + "estimate_midi_option = \"False\" # @param [\"False\", \"True | parselmouth\", \"True | harvest\", \"True | SOME\"]\n", + "if estimate_midi_option == \"True | parselmouth\":\n", + " estimate_midi = True\n", + " midi_pitch_ext = \"parselmouth\"\n", + "elif estimate_midi_option == \"True | harvest\":\n", + " estimate_midi = True\n", + " midi_pitch_ext = \"harvest\"\n", + "else:\n", + " estimate_midi = False\n", + " midi_pitch_ext = None\n", + "#@markdown Determine how long it will segment your data to based on silence phoneme placement (seconds)\n", + "segment_length = 12 #@param {type:\"slider\", min:5, max:35, step:1}\n", + "\n", + "#@markdown Determine how many silence phoneme is allowed in the middle of each segment\n", + "max_silence_phoneme_amount = 2 #@param {type:\"slider\", min:0, max:50, step:1}\n", + "\n", + "# leaving -S at 60 so max silence can be 60 seconds that exceeds the segment legnth cap idk why///\n", + "# making the segment length cap at 35 secs because any longer than that would make training goes really slow\n", + "\n", + "# my ass dont remember why i made two... 
i think one is unnecessary extra but mehhh\n", + "all_shits = \"/content/raw_data\"\n", + "all_shits_not_wav_n_lab = \"/content/raw_data/diffsinger_db\"\n", + "\n", + "import os\n", + "import csv\n", + "import json\n", + "import shutil\n", + "from pydub import AudioSegment\n", + "import yaml\n", + "\n", + "if os.path.exists(\"/content/raw_data\"):\n", + " shutil.rmtree(\"/content/raw_data\")\n", + "\n", + "if not os.path.exists(all_shits_not_wav_n_lab):\n", + " os.makedirs(all_shits_not_wav_n_lab)\n", + "\n", + "# using 'if not' bc i edited the wrong section which im also too lazy to fix it <3\n", + "if not data_type == \"lab + wav (NNSVS format)\":\n", + " #changed to 7zip to support more compression types\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + " for root, dirs, files in os.walk(all_shits):\n", + " for filename in files:\n", + " if filename.endswith(\".lab\"):\n", + " file_path = os.path.join(root, filename)\n", + " with open(file_path, \"r\") as file:\n", + " file_data = file.read()\n", + " file_data = file_data.replace(\"SP\", \"pau\")\n", + " file_data = file_data.replace(\"br\", \"AP\")\n", + " with open(file_path, \"w\") as file:\n", + " file.write(file_data)\n", + "\n", + "else:\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + "\n", + "\n", + "# for funny auto dict generator lmao\n", + "out = \"/content/DiffSinger/dictionaries\"\n", + "dictionary_files = []\n", + "dictionary_conf_lines = []\n", + "\n", + "def is_excluded(phoneme):\n", + " return phoneme in [\"pau\", \"AP\", \"SP\", \"sil\"]\n", + "\n", + "lang_config_path = all_shits_not_wav_n_lab +\"/lang_config.yaml\"\n", + "\n", + "if not os.path.exists(lang_config_path):\n", + " extra_phonemes = []\n", + " merged_phoneme_groups = []\n", + " all_phonemes = set()\n", + "\n", + " for root, dirs, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " fpath = os.path.join(root, file)\n", + " # honestly if people still have whatever/phoneme 
in their single dict, they shouldnt be doing single dict in the first place\n", + " if file.endswith(\".lab\"):\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " parts = line.strip().split()\n", + " if len(parts) < 3:\n", + " continue\n", + " phoneme = parts[2]\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " elif file.endswith(\".csv\"):\n", + " with open(fpath, \"r\", newline=\"\") as csv_file:\n", + " csv_reader = csv.DictReader(csv_file)\n", + " for row in csv_reader:\n", + " if \"ph_seq\" in row:\n", + " for phoneme in row[\"ph_seq\"].strip().split():\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " elif file.endswith(\".ds\"):\n", + " with open(fpath, \"r\") as json_file:\n", + " data = json.load(json_file)\n", + " for entry in data:\n", + " if \"ph_seq\" in entry:\n", + " for phoneme in entry[\"ph_seq\"].strip().split():\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " os.makedirs(out, exist_ok=True)\n", + " custom_dict_path = os.path.join(out, \"dictionary-custom.txt\")\n", + " dictionary_files.append(custom_dict_path)\n", + " dictionary_conf_lines.append(f\"custom: '{custom_dict_path}'\")\n", + " with open(custom_dict_path, \"w\", encoding=\"utf-8\") as out_file:\n", + " for phoneme in sorted(all_phonemes):\n", + " out_file.write(f\"{phoneme}\\t{phoneme}\\n\")\n", + " lang_dict = None\n", + "\n", + "else:\n", + " with open(lang_config_path, \"r\") as yaml_file:\n", + " lang_config = yaml.safe_load(yaml_file)\n", + "\n", + " languages = lang_config.get(\"languages\", [])\n", + " extra_phonemes = lang_config.get(\"extra_phonemes\", [])\n", + " merged_phoneme_groups = 
lang_config.get(\"merged_phoneme_groups\", [])\n", + "\n", + " lang_dict = {lang: set() for lang in languages}\n", + "\n", + " for folder in os.listdir(all_shits_not_wav_n_lab):\n", + " if \".\" in folder:\n", + " _, lang_code = folder.rsplit(\".\", 1)\n", + " if lang_code not in languages:\n", + " continue\n", + "\n", + " phoneme_folder_path = os.path.join(all_shits_not_wav_n_lab, folder)\n", + "\n", + " for root, dirs, files in os.walk(phoneme_folder_path):\n", + " for file in files:\n", + " fpath = os.path.join(root, file)\n", + "\n", + " if data_type == \"lab + wav (NNSVS format)\":\n", + " if file.endswith(\".lab\"):\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " line = line.strip()\n", + " if not line:\n", + " continue\n", + " parts = line.split()\n", + " if len(parts) < 3:\n", + " continue\n", + " phoneme = parts[2]\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " elif data_type == \"csv + wav (DiffSinger format)\":\n", + " if file.endswith(\".csv\"):\n", + " with open(fpath, \"r\", newline=\"\") as csv_file:\n", + " csv_reader = csv.DictReader(csv_file)\n", + " for row in csv_reader:\n", + " if \"ph_seq\" in row:\n", + " ph_seq = row[\"ph_seq\"].strip()\n", + " for phoneme in ph_seq.split():\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " else:\n", + " if file.endswith(\".ds\"):\n", + " with open(fpath, \"r\") as json_file:\n", + " data = json.load(json_file)\n", + " for entry in 
data:\n", + " if \"ph_seq\" in entry:\n", + " ph_seq = entry[\"ph_seq\"].strip()\n", + " for phoneme in ph_seq.split():\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " for lang, ph_set in lang_dict.items():\n", + " output_path = os.path.join(out, f\"dictionary-{lang}.txt\")\n", + " dictionary_files.append(output_path)\n", + " dictionary_conf_lines.append(f\"{lang}: '{output_path}'\")\n", + " with open(output_path, \"w\", encoding=\"utf-8\") as out_file:\n", + " for phoneme in sorted(ph_set):\n", + " out_file.write(f\"{phoneme}\\t{phoneme}\\n\")\n", + "\n", + "# used this for check runs\n", + "#for dicks in dictionary_files:\n", + "# print(dicks)\n", + "\n", + "# for vowels and consonants.txt.... well adding luquid type for uta's script\n", + "dict_path = out\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\"}\n", + "liquid_types = {\"y\", \"w\", \"l\", \"r\"} # r for english labels, it should be fine with jp too\n", + "vowel_data = []\n", + "consonant_data = []\n", + "liquid_data = []\n", + "\n", + "for dict_path in dictionary_files:\n", + " with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " if phoneme[0] in vowel_types:\n", + " vowel_data.append(phoneme)\n", + " elif phoneme[0] in liquid_types:\n", + " liquid_data.append(phoneme)\n", + " else:\n", + " consonant_data.append(phoneme)\n", + "\n", + "vowel_data.sort()\n", + "liquid_data.sort()\n", + "consonant_data.sort()\n", + "directory = os.path.dirname(dict_path)\n", + "\n", + "# make txt for language json file\n", + "vowel_txt_path = os.path.join(directory, \"vowels.txt\")\n", + "with open(vowel_txt_path, \"w\") as f:\n", + " f.write(\" 
\".join(vowel_data))\n", + "liquid_txt_path = os.path.join(directory, \"liquids.txt\")\n", + "with open(liquid_txt_path, \"w\") as f:\n", + " f.write(\" \".join(liquid_data))\n", + "consonant_txt_path = os.path.join(directory, \"consonants.txt\")\n", + "with open(consonant_txt_path, \"w\") as f:\n", + " f.write(\" \".join(consonant_data))\n", + "\n", + "\n", + "# here's a funny json append\n", + "with open(vowel_txt_path, \"r\") as f:\n", + " vowel_data = f.read().split()\n", + "with open(liquid_txt_path, \"r\") as f:\n", + " liquid_data = f.read().split()\n", + "with open(consonant_txt_path, \"r\") as f:\n", + " consonant_data = f.read().split()\n", + "liquid_list = {liquid: True for liquid in liquid_data} #temp fix, might need more research about the push in timing'''\n", + "phones4json = {\"vowels\": vowel_data, \"liquids\": liquid_list}\n", + "with open(\"/content/nnsvs-db-converter/lang.sample.json\", \"w\") as rawr:\n", + " json.dump(phones4json, rawr, indent=4)\n", + "\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " db_converter_script = \"/content/nnsvs-db-converter/db_converter.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " if estimate_midi:\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -m -c -L \"/content/nnsvs-db-converter/lang.sample.json\" {raw_folder_path}\n", + " else:\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -L \"/content/nnsvs-db-converter/lang.sample.json\" {raw_folder_path}\n", + " !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab\n", + " !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null\n", + " !rm -rf {raw_folder_path}/diffsinger_db\n", + " if estimate_midi_option == \"True | SOME\":\n", + " !python /content/SOME/batch_infer.py --model 
\"/content/DiffSinger/checkpoints/SOME/0119_continuous256_5spk/model_ckpt_steps_100000_simplified.ckpt\" --dataset {raw_folder_path} --overwrite\n", + "\n", + "elif data_type == \"ds (DiffSinger format)\":\n", + " ds_segment_script = \"/content/ghin_shenanigans/scripts/ds_segmentor.py\"\n", + " ds2csv_script = \"/content/MakeDiffSinger/variance-temp-solution/convert_ds.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " ds_exp_path = os.path.join(raw_folder_path, \"ds\")\n", + " csv_exp_path = os.path.join(raw_folder_path, \"transcriptions.csv\")\n", + " !python {ds_segment_script} {raw_folder_path} --export_path {ds_exp_path}\n", + " !rm -rf {raw_folder_path}/*.ds #clean it cus why not\n", + " !python {ds2csv_script} ds2csv {ds_exp_path} {csv_exp_path}\n", + "else:\n", + " pass\n", + "\n", + "# make it replace the first SP to AP cus it seems like people always forgot about it\n", + "for root, _, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " if file.endswith(\".csv\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\", newline=\"\") as input_file:\n", + " csv_reader = csv.reader(input_file)\n", + " data = [row for row in csv_reader]\n", + " header = data[0]\n", + " if \"ph_seq\" in header:\n", + " ph_seq_index = header.index(\"ph_seq\")\n", + " if len(data) > 1 and len(data[1]) > ph_seq_index:\n", + " data[1][ph_seq_index] = data[1][ph_seq_index].replace(\"SP\", \"AP\", 1)\n", + " with open(file_path, \"w\", newline=\"\") as output_file:\n", + " csv_writer = csv.writer(output_file)\n", + " csv_writer.writerows(data)\n", + "\n", + "print(\"extraction complete!\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"I'm also nice enough to convert your data and also write your dictionaries lmao. 
You are welcome :)\")" + ], + "metadata": { + "cellView": "form", + "id": "JsP1TGg2F1g3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7ebf686e-957d-43bc-ddb1-40697e7abec9" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content\n", + "\n", + "7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21\n", + "p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)\n", + "\n", + "Scanning the drive for archives:\n", + " 0M Scan /content/drive/MyDrive/dataset/\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b1 file, 162305254 bytes (155 MiB)\n", + "\n", + "Extracting archive: /content/drive/MyDrive/dataset/Kochujang.zip\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:300: SyntaxWarning: invalid escape sequence '\\('\n", + " m = re.match('([su]([0-9]{1,2})p?) 
\\(([0-9]{1,2}) bit\\)$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:301: SyntaxWarning: invalid escape sequence '\\('\n", + " m2 = re.match('([su]([0-9]{1,2})p?)( \\(default\\))?$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:310: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(flt)p?( \\(default\\))?$', token):\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:314: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(dbl)p?( \\(default\\))?$', token):\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--\n", + "Path = /content/drive/MyDrive/dataset/Kochujang.zip\n", + "Type = zip\n", + "Physical Size = 162305254\n", + "\n", + " 0%\b\b\b\b \b\b\b\b 8% 2 - JP03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 17% 5 - JP04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 26% 8 - TH01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 31% 8 - TH01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 40% 11 - TH02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 45% 14 - TH03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 49% 14 - TH03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 57% 17 - TH04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 59% 17 - TH04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 66% 21 - EN01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 69% 21 - EN01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 74% 24 - EN02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 81% 27 - EN03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 88% 30 - 
EN04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 95% 36 - JP02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bEverything is Ok\n", + "\n", + "Files: 37\n", + "Size: 241625397\n", + "Compressed: 162305254\n", + "extraction complete!\n", + "|\n", + "|\n", + "|\n", + "I'm also nice enough to convert your data and also write your dictionaries lmao. You are welcome :)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title #Edit Config\n", + "#@markdown ___\n", + "\n", + "import re\n", + "import os\n", + "import yaml\n", + "import random #for the random test files lmaoz\n", + "\n", + "%cd /content\n", + "clear_output()\n", + "#@markdown The model type user is training\n", + "model_type = \"variance\" # @param [\"acoustic\", \"variance\"]\n", + "config_cap = model_type.upper()\n", + "diffusion_type = \"reflow\" # @param [\"ddpm\", \"reflow\"]\n", + "diff_accelerator = \"unipc\" # @param [\"ddim\", \"pndm\", \"dpm-solver\", \"unipc\"]\n", + "loss_type = \"l2\" # @param [\"l1\", \"l2\"]\n", + "\n", + "spk_name = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]\n", + "# i used spk_name for something else cus i forgor now imma just copy and paste it\n", + "spk_names = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]\n", + "num_spk = len(spk_name)\n", + "num_lang = len(dictionary_files)\n", + "raw_dir = []\n", + "datasets = []\n", + "for folder_name in spk_name:\n", + " folder_path = os.path.join(all_shits_not_wav_n_lab, folder_name)\n", + " raw_dir.append(folder_path)\n", + "folder_to_id = {folder_name: i for i, folder_name in enumerate(spk_name)}\n", + "\n", + "if num_spk == 1:\n", + " singer_type = \"SINGLE-SPEAKER\"\n", + " use_spk_id = False\n", + "\n", + " for spk_id, (folder_path, speaker_name) in 
enumerate(zip(raw_dir, spk_name)):\n", + " if data_type == \"ds (DiffSinger format)\":\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path) if f.endswith(\".ds\")]\n", + " else:\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path + \"/wavs\") if f.endswith(\".wav\")]\n", + " folder_id = folder_to_id.get(speaker_name, -1)\n", + " prefixed_audio_files = [f\"{audio_file}\" for audio_file in audio_files]\n", + "\n", + " random_ass_test_files = prefixed_audio_files[:3]\n", + "\n", + " speaker_name, lang_id = os.path.splitext(speaker_name) #tfw i forgot this last time\n", + "\n", + " datasets.append({\n", + " \"raw_data_dir\": folder_path,\n", + " \"speaker\": speaker_name,\n", + " \"spk_id\": 0,\n", + " \"language\": \"custom\",\n", + " \"test_prefixes\": random_ass_test_files\n", + " })\n", + "else:\n", + " singer_type = \"MULTI-SPEAKER\"\n", + " use_spk_id = True\n", + "\n", + " for spk_id, (folder_path, speaker_name) in enumerate(zip(raw_dir, spk_name)):\n", + " if data_type == \"ds (DiffSinger format)\":\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path) if f.endswith(\".ds\")]\n", + " else:\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path + \"/wavs\") if f.endswith(\".wav\")]\n", + " folder_id = folder_to_id.get(speaker_name, -1)\n", + " prefixed_audio_files = [f\"{audio_file}\" for audio_file in audio_files]\n", + "\n", + " random_ass_test_files = prefixed_audio_files[:3]\n", + "\n", + " speaker_name, lang_id = os.path.splitext(speaker_name) #tfw i forgot this last time\n", + "\n", + " datasets.append({\n", + " \"raw_data_dir\": folder_path,\n", + " \"speaker\": speaker_name,\n", + " \"spk_id\": spk_id,\n", + " \"language\": lang_id.lstrip(\".\") or \"custom\",\n", + " \"test_prefixes\": random_ass_test_files\n", + " })\n", + "\n", + "dictionaries = {}\n", + "for line in dictionary_conf_lines:\n", + " key, value = line.split(\": \", 1)\n", + " dictionaries[key] = value.strip(\"'\")\n", + "\n", + "#@markdown Shallow 
Diffusion training\n", + "use_shallow_diffusion = \"true | gt_val\" # @param [\"false\", \"true | aux_val\", \"true | gt_val\"]\n", + "if use_shallow_diffusion == \"false\":\n", + " shallow = False\n", + " gt_shallow = False\n", + "elif use_shallow_diffusion == \"true | aux_val\":\n", + " shallow = True\n", + " gt_shallow = False\n", + "else:\n", + " shallow = True\n", + " gt_shallow = True\n", + "\n", + "#@markdown Half precision, or mixed precision can result in improved performance, achieving speedups on training (from [doc](https://lightning.ai/docs/pytorch/stable/common/trainer.html#precision))\n", + "# the reason why i dont add 64 is because colab is already dreadfully slow at 32 so yes im leaving it out\n", + "precision = \"16-mixed\" # @param [\"32-true\", \"bf16-mixed\", \"16-mixed\", \"bf16\", \"16\"]\n", + "\n", + "#@markdown User model save path\n", + "save_dir = \"/content/drive/MyDrive/dataset\" #@param {type:\"string\"}\n", + "\n", + "binary_save_dir = save_dir + \"/binary\"\n", + "\n", + "conf_dir = save_dir\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Option to use base model for finetuning\n", + "\n", + "enable_finetuning = True # @param {type:\"boolean\"}\n", + "\n", + "\n", + "#@markdown Path to custom base model, leave blank to use [default](https://github.com/haru0l/diffsinger_models) models\n", + "#wtf haru i just looked at your readme\"\"\"\"\"\n", + "\n", + "base_model_path = \"/content/DiffSinger/checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt\" # @param {type:\"string\"}\n", + "\n", + "if 
enable_finetuning:\n", + " pretrain = True\n", + " if base_model_path:\n", + " pretrain_ckpt = base_model_path\n", + " else:\n", + " pretrain_ckpt = f\"/content/pretrain_models/{model_type}_pretrain.ckpt\"\n", + " finetune_strict_shapes = False\n", + " finetune_ckpt_path = pretrain_ckpt\n", + "else:\n", + " pretrain = False\n", + " finetune_strict_shapes = True #default value\n", + " finetune_ckpt_path = None #default value\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Model embeds check; Tension, Energy, Breathiness, Voicing | for both acoustic and variance\n", + "\n", + "#@markdown we limited the pair up choice to prevent the quality and usage issue, if user wish to enable option(s) outside of these choices then please keep in mind that most of these embeds do not work well together except for [energy + breathiness]\n", + "\n", + "selected_param = \"tension + voicing\" # @param [\"energy\", \"breathiness\", \"energy + breathiness\", \"tension\", \"voicing\", \"tension + voicing\", \"none\"]\n", + "param_flags = {\n", + " \"energy\": {\"tension\": False, \"energy\": True, \"breathiness\": False, \"voicing\": False},\n", + " \"breathiness\": {\"tension\": False, \"energy\": False, \"breathiness\": True, \"voicing\": False},\n", + " \"energy + breathiness\": {\"tension\": False, \"energy\": True, \"breathiness\": True, \"voicing\": False},\n", + " \"tension\": {\"tension\": True, \"energy\": False, \"breathiness\": False, \"voicing\": False},\n", + " \"voicing\": {\"tension\": False, \"energy\": False, \"breathiness\": False, \"voicing\": 
True},\n", + " \"tension + voicing\": {\"tension\": True, \"energy\": False, \"breathiness\": False, \"voicing\": True},\n", + " \"none\": {\"tension\": False, \"energy\": False, \"breathiness\": False, \"voicing\": False},\n", + "}\n", + "\n", + "flags = param_flags.get(selected_param, param_flags[\"none\"])\n", + "\n", + "tension_training = flags[\"tension\"]\n", + "energy_training = flags[\"energy\"]\n", + "breathiness_training = flags[\"breathiness\"]\n", + "voicing_training = flags[\"voicing\"]\n", + "\n", + "parameter_extraction_method = \"vr\" # @param [\"vr\", \"world\"]\n", + "\n", + "### forcing data aug to be true by default cus i dont think anyone would disable it and its good to be on by default\n", + "data_aug = True #param {type:\"boolean\"}\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Model training check | for variance only\n", + "\n", + "\n", + "\n", + "#@markdown due to skill issues, if user wish to train with glide embed, please enable it manually in the config\n", + "pitch_training = \"True | Standard\" # @param [\"False\", \"True | Standard\", \"True | MelodyEncoder\"]\n", + "if pitch_training == \"False\":\n", + " pitch_training = False\n", + " use_melody_encoder = False\n", + " use_glide_embed = False\n", + "elif pitch_training == \"True | Standard\":\n", + " pitch_training = True\n", + " use_melody_encoder = False\n", + " use_glide_embed = False\n", + "else:\n", + " pitch_training = True\n", + " use_melody_encoder = True\n", + " use_glide_embed = False\n", + "\n", + "duration_training = True #@param {type: 
\"boolean\"}\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Pitch extractor algorithm\n", + "\n", + "f0_ext = \"rmvpe\" # @param [\"parselmouth\", \"rmvpe\", \"harvest\"]\n", + "if f0_ext == \"rmvpe\":\n", + " pe_ckpt_pth = \"checkpoints/rmvpe/model.pt\"\n", + "else:\n", + " pe_ckpt_pth = None\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Proceeding sections are the parameters that will greatly affect the model's final quality and size. 
Read about them [here](https://github.com/openvpi/DiffSinger/blob/main/docs/ConfigurationSchemas.md)\n", + "\n", + "#@markdown So if you don't know what they do then please leave these options at default , otherwise it could affect your model badly\n", + "\n", + "#@markdown anyone is welcome to experiment though\n", + "\n", + "#@markdown model_hidden_size: hidden layers for FS2 and token param embeds\n", + "\n", + "#@markdown model_residual_layers | model_residual_channels: the model's main layers and channels\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "#@markdown Model's network/layer size for acoustic\n", + "\n", + "#@markdown The quality of samplig_algorithm is in order, range from euler being the LEAST accurate to rk5 being the MOST accurate.... 
Though euler works fine on most cases\n", + "sampling_algorithm = \"euler\" # @param [\"euler\", \"rk2\", \"rk4\", \"rk5\"]\n", + "\n", + "acoustic_hidden_size = 256 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "acoustic_num_layers = 6 # @param {type:\"slider\", min:2, max:42, step:2}\n", + "acoustic_num_channels = 1024 # @param {type:\"slider\", min:2, max:2048, step:2}\n", + "\n", + "#@markdown Model's network/layer size for variance\n", + "variance_hidden_size = 256 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "duration_hidden_size = 512 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "melody_encoder_hidden_size = 128 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "pitch_num_layers = 6 # @param {type:\"slider\", min:2, max:100, step:2}\n", + "pitch_num_channels = 512 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "variance_num_layers = 6 # @param {type:\"slider\", min:2, max:100, step:2}\n", + "variance_num_channels = 384 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "\n", + "\n", + "with open(\"/content/DiffSinger/configs/base.yaml\", \"r\") as config:\n", + " mother = yaml.safe_load(config)\n", + "mother[\"pl_trainer_precision\"] = precision\n", + "with open(\"/content/DiffSinger/configs/base.yaml\", \"w\") as config:\n", + " yaml.dump(mother, config)\n", + "\n", + "if data_type == \"ds (DiffSinger format)\":\n", + " prefer_ds = True\n", + "else:\n", + " prefer_ds = False\n", + "\n", + "if model_type == \"acoustic\":\n", + " with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"r\") as config:\n", + " bitch_ass_config = yaml.safe_load(config)\n", + " bitch_ass_config[\"datasets\"] = datasets\n", + " bitch_ass_config[\"num_spk\"] = num_spk\n", + " bitch_ass_config[\"use_spk_id\"] = use_spk_id\n", + " bitch_ass_config[\"extra_phonemes\"] = extra_phonemes\n", + " bitch_ass_config[\"merged_phoneme_groups\"] = merged_phoneme_groups\n", + " 
bitch_ass_config[\"use_lang_id\"] = bool(merged_phoneme_groups)\n", + " bitch_ass_config[\"num_lang\"] = num_lang\n", + " bitch_ass_config[\"pretrain\"] = pretrain\n", + " bitch_ass_config[\"diffusion_type\"] = diffusion_type\n", + " bitch_ass_config[\"diff_accelerator\"] = diff_accelerator\n", + " bitch_ass_config[\"main_loss_type\"] = loss_type\n", + " bitch_ass_config[\"binary_data_dir\"] = binary_save_dir\n", + " bitch_ass_config[\"dictionaries\"] = dictionaries\n", + " bitch_ass_config[\"augmentation_args\"][\"random_pitch_shifting\"][\"enabled\"] = data_aug\n", + " bitch_ass_config[\"augmentation_args\"][\"random_time_stretching\"][\"enabled\"] = data_aug\n", + " bitch_ass_config[\"use_key_shift_embed\"] = data_aug\n", + " bitch_ass_config[\"use_speed_embed\"] = data_aug\n", + " bitch_ass_config[\"pe\"] = f0_ext\n", + " bitch_ass_config[\"use_energy_embed\"] = energy_training\n", + " bitch_ass_config[\"use_breathiness_embed\"] = breathiness_training\n", + " bitch_ass_config[\"use_tension_embed\"] = tension_training\n", + " bitch_ass_config[\"use_voicing_embed\"] = voicing_training\n", + "\n", + " bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth\n", + " bitch_ass_config[\"tension_smooth_width\"] = 0.06 #0.12\n", + " #shallow diff stuff\n", + " bitch_ass_config[\"use_shallow_diffusion\"] = shallow\n", + " bitch_ass_config[\"shallow_diffusion_args\"][\"val_gt_start\"] = gt_shallow\n", + " #finetue stuff\n", + " bitch_ass_config[\"finetune_enabled\"] = enable_finetuning\n", + " bitch_ass_config[\"finetune_ckpt_path\"] = finetune_ckpt_path\n", + " bitch_ass_config[\"finetune_strict_shapes\"] = finetune_strict_shapes\n", + " #vr\n", + " bitch_ass_config[\"hnsep\"] = parameter_extraction_method\n", + " #layers\n", + " bitch_ass_config[\"sampling_algorithm\"] = sampling_algorithm\n", + " bitch_ass_config[\"hidden_size\"] = acoustic_hidden_size\n", + " bitch_ass_config[\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"backbone_args\"][\"num_layers\"] = 
acoustic_num_layers\n", + " bitch_ass_config[\"backbone_args\"][\"num_channels\"] = acoustic_num_channels\n", + "\n", + " with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"w\") as config:\n", + " yaml.dump(bitch_ass_config, config)\n", + "else:\n", + " with open(\"/content/DiffSinger/configs/variance.yaml\", \"r\") as config:\n", + " bitch_ass_config = yaml.safe_load(config)\n", + " bitch_ass_config[\"datasets\"] = datasets\n", + " bitch_ass_config[\"num_spk\"] = num_spk\n", + " bitch_ass_config[\"use_spk_id\"] = use_spk_id\n", + " bitch_ass_config[\"extra_phonemes\"] = extra_phonemes\n", + " bitch_ass_config[\"merged_phoneme_groups\"] = merged_phoneme_groups\n", + " bitch_ass_config[\"use_lang_id\"] = bool(merged_phoneme_groups)\n", + " bitch_ass_config[\"num_lang\"] = num_lang\n", + " bitch_ass_config[\"main_loss_type\"] = loss_type\n", + " bitch_ass_config[\"diffusion_type\"] = diffusion_type\n", + " bitch_ass_config[\"diff_accelerator\"] = diff_accelerator\n", + " bitch_ass_config[\"binary_data_dir\"] = binary_save_dir\n", + " bitch_ass_config[\"dictionaries\"] = dictionaries\n", + " bitch_ass_config[\"pe\"] = f0_ext # i think variance uses it for pitch ref as ground-truth for pitch training soooo\n", + " bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth #same goes to this one\n", + " bitch_ass_config[\"tension_smooth_width\"] = 0.06 #0.12\n", + "\n", + " bitch_ass_config[\"predict_energy\"] = energy_training\n", + " bitch_ass_config[\"predict_breathiness\"] = breathiness_training\n", + " bitch_ass_config[\"predict_tension\"] = tension_training\n", + " bitch_ass_config[\"predict_pitch\"] = pitch_training\n", + " bitch_ass_config[\"predict_voicing\"] = voicing_training\n", + "\n", + " bitch_ass_config[\"use_melody_encoder\"] = use_melody_encoder\n", + " bitch_ass_config[\"use_glide_embed\"] = use_glide_embed\n", + " bitch_ass_config[\"predict_dur\"] = duration_training\n", + " bitch_ass_config[\"binarization_args\"][\"prefer_ds\"] = prefer_ds\n", + " 
#finetune stuff\n", + " bitch_ass_config[\"finetune_enabled\"] = enable_finetuning\n", + " bitch_ass_config[\"finetune_ckpt_path\"] = finetune_ckpt_path\n", + " bitch_ass_config[\"finetune_strict_shapes\"] = finetune_strict_shapes\n", + " #vr\n", + " bitch_ass_config[\"hnsep\"] = parameter_extraction_method\n", + " bitch_ass_config[\"hnsep_ckpt\"] = \"checkpoints/vr/model.pt\"\n", + " #layers\n", + " bitch_ass_config[\"hidden_size\"] = variance_hidden_size\n", + " bitch_ass_config[\"dur_prediction_args\"][\"hidden_size\"] = duration_hidden_size\n", + " bitch_ass_config[\"melody_encoder_args\"][\"hidden_size\"] = melody_encoder_hidden_size\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_args\"][\"num_layers\"] = variance_num_layers\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_args\"][\"num_channels\"] = variance_num_channels\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_args\"][\"num_layers\"] = pitch_num_layers\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_args\"][\"num_channels\"] = pitch_num_channels\n", + "\n", + " with open(\"/content/DiffSinger/configs/variance.yaml\", \"w\") as config:\n", + " yaml.dump(bitch_ass_config, config)\n", + "\n", + "os.makedirs(save_dir, exist_ok=True)\n", + "\n", + "\n", + "with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as f:\n", + " hparams_py_read = f.read()\n", + "hparams_py_read = re.sub(r\"args_work_dir\\s*=\\s*.*\", f\"args_work_dir = '{save_dir}'\", hparams_py_read)\n", + "with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as f:\n", + " f.write(hparams_py_read)\n", + "\n", + "with open(\"/content/DiffSinger/utils/training_utils.py\", \"r\") as f:\n", + " training_utils_stuff = f.read()\n", + "training_utils_stuff = re.sub(\"relative_path\\s*=\\s*.*\", 
\"relative_path = filepath.relative_to(Path('/content').resolve())\", training_utils_stuff)\n", + "with open(\"/content/DiffSinger/utils/training_utils.py\", \"w\") as f:\n", + " f.write(training_utils_stuff)\n", + "\n", + "spk_names = [os.path.splitext(name)[0] for name in spk_names]\n", + "dict_dir = os.path.dirname(dict_path)\n", + "\n", + "print(\"config updated! see below for config's information\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(f\"+++---{config_cap} {singer_type} TRAINING---+++\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"+++---user's settings---+++\")\n", + "print(\"\\n\")\n", + "print(f\"speaker name: {spk_names}\")\n", + "print(\"\\n\")\n", + "print(f\"data augmentation: {data_aug}\")\n", + "print(\"\\n\")\n", + "print(f\"pitch extractor: {f0_ext}\")\n", + "print(\"\\n\")\n", + "print(f\"binary data save directory: {binary_save_dir}\")\n", + "print(\"\\n\")\n", + "print(f\"your model will be saved to: {save_dir}\")\n", + "print(\"\\n\")\n", + "print(\"==========================================================================================\")\n", + "print(\"\\n\")\n", + "print(\"+++---other auto-defined settings---+++\")\n", + "#print(\"\\n\")\n", + "#print(f\"test files (auto selected): {random_ass_test_files}\")\n", + "print(\"\\n\")\n", + "print(f\"dictionary (auto generated): {dict_dir} (check this directory)\")\n", + "print(\"\\n\")\n", + "print(\"==========================================================================================\")\n", + "print(\"\\n\")\n", + "print(\"if you don't like or disagree with any of these options,\")\n", + "print(f\"you can go and edit the config at [/content/DiffSinger/configs/{model_type}.yaml]\")\n" + ], + "metadata": { + "cellView": "form", + "id": "nI3dzDv_Mr9Y", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "dc2c2cf6-4827-410d-81b7-0c5377e5670f" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": 
"stream", + "name": "stdout", + "text": [ + "config updated! see below for config's information\n", + "|\n", + "|\n", + "|\n", + "+++---VARIANCE MULTI-SPEAKER TRAINING---+++\n", + "|\n", + "|\n", + "|\n", + "+++---user's settings---+++\n", + "\n", + "\n", + "speaker name: []\n", + "\n", + "\n", + "data augmentation: True\n", + "\n", + "\n", + "pitch extractor: rmvpe\n", + "\n", + "\n", + "binary data save directory: /content/drive/MyDrive/dataset/binary\n", + "\n", + "\n", + "your model will be saved to: /content/drive/MyDrive/dataset\n", + "\n", + "\n", + "==========================================================================================\n", + "\n", + "\n", + "+++---other auto-defined settings---+++\n", + "\n", + "\n", + "dictionary (auto generated): /content/DiffSinger/dictionaries (check this directory)\n", + "\n", + "\n", + "==========================================================================================\n", + "\n", + "\n", + "if you don't like or disagree with any of these options,\n", + "you can go and edit the config at [/content/DiffSinger/configs/variance.yaml]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Preprocess data\n", + "import os\n", + "#we dont need that old f0 limit change anymore <3\n", + "training_config = f\"/content/DiffSinger/configs/{model_type}.yaml\"\n", + "%cd /content/DiffSinger\n", + "os.environ['PYTHONPATH']='.'\n", + "!CUDA_VISIBLE_DEVICES=0 python /content/DiffSinger/scripts/binarize.py --config {training_config} --reset" + ], + "metadata": { + "cellView": "form", + "id": "76NvDR1cXlDM", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8de8d8c2-cef2-4b87-a3cc-e133ac12f1c2" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/DiffSinger\n", + "| Hparams chains: ['configs/base.yaml', '/content/DiffSinger/configs/variance.yaml']\n", + "| Hparams: \n", + "\u001b[0;33mK_step\u001b[0m: 1000, 
\u001b[0;33maccumulate_grad_batches\u001b[0m: 1, \u001b[0;33maudio_sample_rate\u001b[0m: 44100, \u001b[0;33mbase_config\u001b[0m: ['configs/base.yaml'], \u001b[0;33mbinarization_args\u001b[0m: {'num_workers': 0, 'shuffle': True, 'prefer_ds': True}, \n", + "\u001b[0;33mbinarizer_cls\u001b[0m: preprocessing.variance_binarizer.VarianceBinarizer, \u001b[0;33mbinary_data_dir\u001b[0m: /content/drive/MyDrive/dataset/binary, \u001b[0;33mbreathiness_db_max\u001b[0m: -20.0, \u001b[0;33mbreathiness_db_min\u001b[0m: -96.0, \u001b[0;33mbreathiness_smooth_width\u001b[0m: 0.12, \n", + "\u001b[0;33mclip_grad_norm\u001b[0m: 1, \u001b[0;33mdataloader_prefetch_factor\u001b[0m: 2, \u001b[0;33mdataset_size_key\u001b[0m: lengths, \u001b[0;33mdatasets\u001b[0m: [], \u001b[0;33mdictionaries\u001b[0m: {'custom': '/content/DiffSinger/dictionaries/dictionary-custom.txt'}, \n", + "\u001b[0;33mdiff_accelerator\u001b[0m: unipc, \u001b[0;33mdiff_speedup\u001b[0m: 10, \u001b[0;33mdiffusion_type\u001b[0m: reflow, \u001b[0;33mdropout\u001b[0m: 0.1, \u001b[0;33mds_workers\u001b[0m: 4, \n", + "\u001b[0;33mdur_prediction_args\u001b[0m: {'arch': 'fs2', 'dropout': 0.1, 'hidden_size': 512, 'kernel_size': 3, 'lambda_pdur_loss': 0.3, 'lambda_sdur_loss': 3.0, 'lambda_wdur_loss': 1.0, 'log_offset': 1.0, 'loss_type': 'mse', 'num_layers': 5}, \u001b[0;33menc_ffn_kernel_size\u001b[0m: 3, \u001b[0;33menc_layers\u001b[0m: 4, \u001b[0;33menergy_db_max\u001b[0m: -12.0, \u001b[0;33menergy_db_min\u001b[0m: -96.0, \n", + "\u001b[0;33menergy_smooth_width\u001b[0m: 0.12, \u001b[0;33mexp_name\u001b[0m: , \u001b[0;33mextra_phonemes\u001b[0m: [], \u001b[0;33mf0_max\u001b[0m: 1100, \u001b[0;33mf0_min\u001b[0m: 65, \n", + "\u001b[0;33mffn_act\u001b[0m: gelu, \u001b[0;33mfft_size\u001b[0m: 2048, \u001b[0;33mfinetune_ckpt_path\u001b[0m: /content/DiffSinger/checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt, \u001b[0;33mfinetune_enabled\u001b[0m: True, \u001b[0;33mfinetune_ignored_params\u001b[0m: 
['model.spk_embed', 'model.fs2.txt_embed', 'model.fs2.encoder.embed_tokens'], \n", + "\u001b[0;33mfinetune_strict_shapes\u001b[0m: False, \u001b[0;33mfreezing_enabled\u001b[0m: False, \u001b[0;33mfrozen_params\u001b[0m: [], \u001b[0;33mglide_embed_scale\u001b[0m: 11.31370849898476, \u001b[0;33mglide_types\u001b[0m: ['up', 'down'], \n", + "\u001b[0;33mhidden_size\u001b[0m: 256, \u001b[0;33mhnsep\u001b[0m: vr, \u001b[0;33mhnsep_ckpt\u001b[0m: checkpoints/vr/model.pt, \u001b[0;33mhop_size\u001b[0m: 512, \u001b[0;33minfer\u001b[0m: False, \n", + "\u001b[0;33mlambda_dur_loss\u001b[0m: 1.0, \u001b[0;33mlambda_pitch_loss\u001b[0m: 1.0, \u001b[0;33mlambda_var_loss\u001b[0m: 1.0, \u001b[0;33mlog_interval\u001b[0m: 100, \u001b[0;33mlr_scheduler_args\u001b[0m: {'gamma': 0.75, 'scheduler_cls': 'torch.optim.lr_scheduler.StepLR', 'step_size': 10000}, \n", + "\u001b[0;33mmain_loss_log_norm\u001b[0m: True, \u001b[0;33mmain_loss_type\u001b[0m: l2, \u001b[0;33mmax_batch_frames\u001b[0m: 80000, \u001b[0;33mmax_batch_size\u001b[0m: 48, \u001b[0;33mmax_beta\u001b[0m: 0.02, \n", + "\u001b[0;33mmax_updates\u001b[0m: 160000, \u001b[0;33mmax_val_batch_frames\u001b[0m: 60000, \u001b[0;33mmax_val_batch_size\u001b[0m: 1, \u001b[0;33mmelody_encoder_args\u001b[0m: {'enc_layers': 4, 'hidden_size': 128}, \u001b[0;33mmerged_phoneme_groups\u001b[0m: [], \n", + "\u001b[0;33mmidi_smooth_width\u001b[0m: 0.06, \u001b[0;33mnccl_p2p\u001b[0m: True, \u001b[0;33mnum_ckpt_keep\u001b[0m: 5, \u001b[0;33mnum_heads\u001b[0m: 2, \u001b[0;33mnum_lang\u001b[0m: 1, \n", + "\u001b[0;33mnum_sanity_val_steps\u001b[0m: 1, \u001b[0;33mnum_spk\u001b[0m: 0, \u001b[0;33mnum_valid_plots\u001b[0m: 10, \u001b[0;33moptimizer_args\u001b[0m: {'beta1': 0.9, 'beta2': 0.98, 'lr': 0.0006, 'optimizer_cls': 'torch.optim.AdamW', 'weight_decay': 0}, \u001b[0;33mpe\u001b[0m: rmvpe, \n", + "\u001b[0;33mpe_ckpt\u001b[0m: checkpoints/rmvpe/model.pt, \u001b[0;33mpermanent_ckpt_interval\u001b[0m: 10000, 
\u001b[0;33mpermanent_ckpt_start\u001b[0m: 80000, \u001b[0;33mpitch_prediction_args\u001b[0m: {'backbone_args': {'dilation_cycle_length': 5, 'num_channels': 512, 'num_layers': 6}, 'backbone_type': 'lynxnet', 'pitd_clip_max': 12.0, 'pitd_clip_min': -12.0, 'pitd_norm_max': 8.0, 'pitd_norm_min': -8.0, 'repeat_bins': 64}, \u001b[0;33mpl_trainer_accelerator\u001b[0m: auto, \n", + "\u001b[0;33mpl_trainer_devices\u001b[0m: auto, \u001b[0;33mpl_trainer_num_nodes\u001b[0m: 1, \u001b[0;33mpl_trainer_precision\u001b[0m: 16-mixed, \u001b[0;33mpl_trainer_strategy\u001b[0m: {'find_unused_parameters': False, 'name': 'auto', 'process_group_backend': 'nccl'}, \u001b[0;33mpredict_breathiness\u001b[0m: False, \n", + "\u001b[0;33mpredict_dur\u001b[0m: True, \u001b[0;33mpredict_energy\u001b[0m: False, \u001b[0;33mpredict_pitch\u001b[0m: True, \u001b[0;33mpredict_tension\u001b[0m: True, \u001b[0;33mpredict_voicing\u001b[0m: True, \n", + "\u001b[0;33mrel_pos\u001b[0m: True, \u001b[0;33mrope_interleaved\u001b[0m: False, \u001b[0;33msampler_frame_count_grid\u001b[0m: 6, \u001b[0;33msampling_algorithm\u001b[0m: euler, \u001b[0;33msampling_steps\u001b[0m: 20, \n", + "\u001b[0;33mschedule_type\u001b[0m: linear, \u001b[0;33msort_by_len\u001b[0m: True, \u001b[0;33mtask_cls\u001b[0m: training.variance_task.VarianceTask, \u001b[0;33mtension_logit_max\u001b[0m: 10.0, \u001b[0;33mtension_logit_min\u001b[0m: -10.0, \n", + "\u001b[0;33mtension_smooth_width\u001b[0m: 0.06, \u001b[0;33mtime_scale_factor\u001b[0m: 1000, \u001b[0;33mtimesteps\u001b[0m: 1000, \u001b[0;33muse_glide_embed\u001b[0m: False, \u001b[0;33muse_lang_id\u001b[0m: False, \n", + "\u001b[0;33muse_melody_encoder\u001b[0m: False, \u001b[0;33muse_pos_embed\u001b[0m: True, \u001b[0;33muse_rope\u001b[0m: True, \u001b[0;33muse_spk_id\u001b[0m: True, \u001b[0;33mval_check_interval\u001b[0m: 2000, \n", + "\u001b[0;33mvariances_prediction_args\u001b[0m: {'backbone_args': {'dilation_cycle_length': 4, 'num_channels': 384, 'num_layers': 6}, 
'backbone_type': 'lynxnet', 'total_repeat_bins': 48}, \u001b[0;33mvoicing_db_max\u001b[0m: -12.0, \u001b[0;33mvoicing_db_min\u001b[0m: -96.0, \u001b[0;33mvoicing_smooth_width\u001b[0m: 0.12, \u001b[0;33mwin_size\u001b[0m: 2048, \n", + "\u001b[0;33mwork_dir\u001b[0m: /content/drive/MyDrive/dataset, \n", + "| Binarizer: \n", + "Traceback (most recent call last):\n", + " File \"/content/DiffSinger/scripts/binarize.py\", line 25, in \n", + " binarize()\n", + " File \"/content/DiffSinger/scripts/binarize.py\", line 21, in binarize\n", + " binarizer_cls().process()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/content/DiffSinger/preprocessing/variance_binarizer.py\", line 67, in __init__\n", + " super().__init__(data_attrs=VARIANCE_ITEM_ATTRIBUTES)\n", + " File \"/content/DiffSinger/basics/base_binarizer.py\", line 60, in __init__\n", + " self.build_spk_map()\n", + " File \"/content/DiffSinger/basics/base_binarizer.py\", line 85, in build_spk_map\n", + " assert max(spk_ids) < hparams['num_spk'], \\\n", + " ^^^^^^^^^^^^\n", + "ValueError: max() iterable argument is empty\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Training**" + ], + "metadata": { + "id": "0J3b18EKdzMC" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown #Train your model\n", + "%cd /content/DiffSinger\n", + "import re\n", + "import os\n", + "import yaml\n", + "#@markdown ___\n", + "\n", + "#@markdown Step interval of when your model will be validate and save\n", + "save_interval = 2000 #@param {type:\"slider\", min:100, max:10000, step:100}\n", + "\n", + "#@markdown batch size setting, too low can cause bottleneck, too high can cause oom\n", + "batch_size = 9 # @param {type:\"slider\", min:1, max:100, step:1}\n", + "\n", + "#@markdown step interval of when your model will stop training automatically\n", + "max_updates = 160000 # @param {type:\"slider\", min:100, max:2000000, step:100}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ###**Only edit this section if 
you want to resume training**\n", + "resume_training = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown select this option if you locally binarized your data | this option will only append your binary data path in your config | \"binary\" folder must be in the same directory as config.yaml\n", + "local_data = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown path to the config you got from training\n", + "re_config_path = \"\" #@param {type:\"string\"}\n", + "model_dir = os.path.dirname(re_config_path)\n", + "save_dir = model_dir\n", + "if resume_training:\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as f:\n", + " hparams_py_read = f.read()\n", + " hparams_py_read = re.sub(r\"args_work_dir\\s*=\\s*.*\", f\"args_work_dir = '{save_dir}'\", hparams_py_read)\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as f:\n", + " f.write(hparams_py_read)\n", + " with open(\"/content/DiffSinger/utils/training_utils.py\", \"r\") as f:\n", + " training_utils_stuff = f.read()\n", + " training_utils_stuff = re.sub(\"relative_path\\s*=\\s*.*\", \"relative_path = filepath.relative_to(Path('/content').resolve())\", training_utils_stuff)\n", + " with open(\"/content/DiffSinger/utils/training_utils.py\", \"w\") as f:\n", + " f.write(training_utils_stuff)\n", + "\n", + " config_path = re_config_path\n", + " log_dir = save_dir\n", + "\n", + " !cp {model_dir}/*.txt /content/DiffSinger/dictionaries\n", + "\n", + "else:\n", + " config_path = training_config\n", + " log_dir = conf_dir\n", + "\n", + "with open(config_path, \"r\") as config:\n", + " ehe = yaml.safe_load(config)\n", + "config_dir = os.path.dirname(config_path)\n", + "yuh = os.path.join(config_dir, \"binary\")\n", + "\n", + "ehe[\"val_check_interval\"] = save_interval\n", + "ehe[\"max_batch_size\"] = batch_size\n", + "ehe[\"max_updates\"] = max_updates\n", + "if local_data:\n", + " ehe[\"binary_data_dir\"] = yuh\n", + "with open(config_path, \"w\") as config:\n", + " 
yaml.dump(ehe, config)\n", + "\n", + "logs = log_dir\n", + "%reload_ext tensorboard\n", + "%tensorboard --logdir {logs}/lightning_logs\n", + "\n", + "!python /content/DiffSinger/scripts/train.py --config {config_path} --exp_name ${save_dir} --reset" + ], + "metadata": { + "cellView": "form", + "id": "Lu5w72UWgccC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Convert model to ONNX format**" + ], + "metadata": { + "id": "FY40fGHEg9_i" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Drop Speakers from Model (Optional)\n", + "#@markdown ___\n", + "#@markdown Use this to drop speakers from your model for distribution. You will need to do it for both acoustic and variance models.\n", + "\n", + "drop_model_path = '' #@param {type: \"string\"}\n", + "#@markdown Type the ID of speakers you'd like to KEEP separated by commas. Ex: \"0,3,4\"
\n", + "#@markdown Note: You can find the ID of speakers in the model by opening the ```spk_map.json``` file in the model folder.
\n", + "#@markdown If you see ```{\"natural\": 0, \"power\": 1, \"silly\": 2}``` but only want to keep \"natural\" and \"power\", type ```0,1``` below.\n", + "retain_speakers = '' #@param {type: \"string\"}\n", + "#@markdown If you don't know what this means, don't change it.\n", + "fill_embed = 'zeros' #@param ['zeros', 'random', 'mean', 'cyclic']\n", + "\n", + "drop_out_path = drop_model_path[:-5] + '_spk-dropped.ckpt'\n", + "\n", + "!python /content/DiffSinger/scripts/drop_spk.py {drop_model_path} {drop_out_path} --retain {retain_speakers} --fill {fill_embed}\n", + "\n", + "\n" + ], + "metadata": { + "id": "21ILzW4OEnh4", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Export ONNX\n", + "#@markdown ___\n", + "%cd /content\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "import os\n", + "import zipfile\n", + "import shutil\n", + "\n", + "if export_mode:\n", + " pass\n", + "else:\n", + " print(\"Installing components to make ONNX work\")\n", + " !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh\n", + " !chmod +x /content/mini.sh\n", + " !bash /content/mini.sh -b -f -p /usr/local\n", + " !conda install -q -y jupyter\n", + " !conda install -q -y google-colab -c conda-forge\n", + " !python -m ipykernel install --name \"py310\" --user\n", + " print(\"installing dependencies for ONNX conversion\")\n", + " !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null\n", + " print(\"Installation complete, time to export those ONNX!\")\n", + "# to counter IF the user is to re-run this cell <3\n", + "if os.path.exists(\"/content/OU_compatible_files\"):\n", + " shutil.rmtree(\"/content/OU_compatible_files\")\n", + " os.remove(\"/content/jpn_dict.txt\")\n", + "else:\n", + " pass\n", + "\n", + "#@markdown select this if you don't want to see the onnx converter's output\n", + "no_output = True # 
@param {type:\"boolean\"}\n", + "\n", + "#@markdown path to your **ACOUSTIC CHECKPOINT** (leave blank if you don't have any): automatically use latest checkpoint that is in the same folder\n", + "acoustic_checkpoint_path = \"\" #@param{type:\"string\"}\n", + "acoustic_folder_name = os.path.basename(os.path.dirname(acoustic_checkpoint_path)) + \"_acoustic\"\n", + "acoustic_folder_path = os.path.dirname(acoustic_checkpoint_path)\n", + "\n", + "#@markdown path to your **VARIANCE CHECKPOINT** (leave blank if you don't have any): automatically use latest checkpoint that is in the same folder\n", + "variance_checkpoint_path = \"\" #@param{type:\"string\"}\n", + "variance_folder_name = os.path.basename(os.path.dirname(variance_checkpoint_path)) + \"_variance\"\n", + "variance_folder_path = os.path.dirname(variance_checkpoint_path)\n", + "\n", + "#@markdown path to where you want to save your ONNX files (it will create a folder named \"onnx\" in this path)\n", + "exp_folder = \"\" #@param{type:\"string\"}\n", + "\n", + "acoustic_onnx_exp = exp_folder + \"/onnx/acoustic\"\n", + "variance_onnx_exp = exp_folder + \"/onnx/variance\"\n", + "\n", + "if not acoustic_checkpoint_path:\n", + " print(\"\\n\")\n", + " print(\"acoustic ckeckpoint path not specified, not exporting acoustic ONNX...\")\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"converting acoustic to onnx...\")\n", + " #cp stuff cus apparently exporter doesnt work without it\n", + " !cp {acoustic_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{acoustic_folder_name}\n", + " search_text = \" args_work_dir = os.path.join(\"\n", + " replacement = f\" args_work_dir = '{acoustic_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text in line:\n", + " lines[i] = replacement + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + 
" file.writelines(lines)\n", + " #incase if anyone wanna change it lmao\n", + " search_text_alt = \" args_work_dir = '\"\n", + " replacement_alt = f\" args_work_dir = '{acoustic_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text_alt in line:\n", + " lines[i] = replacement_alt + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + "\n", + " if no_output:\n", + " !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic >/dev/null 2>&1\n", + " else:\n", + " !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic\n", + "\n", + "\n", + "if not variance_checkpoint_path:\n", + " print(\"\\n\")\n", + " print(\"variance ckeckpoint path not specified, not exporting variance ONNX...\")\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"converting variance to onnx...\")\n", + " #cp stuff cus apparently exporter doesnt work without it\n", + " !cp {variance_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{variance_folder_name}\n", + " search_text = \" args_work_dir = os.path.join(\"\n", + " replacement = f\" args_work_dir = '{variance_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text in line:\n", + " lines[i] = replacement + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + " #incase if anyone wanna change it lmao\n", + " search_text_alt = \" args_work_dir = '\"\n", + " replacement_alt = f\" args_work_dir = '{variance_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", 
+ " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text_alt in line:\n", + " lines[i] = replacement_alt + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + " if no_output:\n", + " !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance >/dev/null 2>&1\n", + " else:\n", + " !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance\n", + "\n", + "\n", + "if not variance_checkpoint_path:\n", + " folder_paths = [acoustic_onnx_exp]\n", + "elif not acoustic_checkpoint_path:\n", + " folder_paths = [variance_onnx_exp]\n", + "else:\n", + " folder_paths = [acoustic_onnx_exp, variance_onnx_exp]\n", + "\n", + "patterns = {\"acoustic.onnx\": \"acoustic.onnx\", \"dur.onnx\": \"dur.onnx\", \"linguistic.onnx\": \"linguistic.onnx\", \"pitch.onnx\": \"pitch.onnx\", \"variance.onnx\": \"variance.onnx\", \"phonemes.txt\": \"phonemes.txt\"}\n", + "\n", + "for folder_path in folder_paths:\n", + " for filename in os.listdir(folder_path):\n", + " for pattern, new_name in patterns.items():\n", + " if pattern in filename:\n", + " old_path = os.path.join(folder_path, filename)\n", + " new_path = os.path.join(folder_path, new_name)\n", + " if os.path.exists(old_path):\n", + " os.rename(old_path, new_path)\n", + "for folder_path in folder_paths:\n", + " for filename in os.listdir(folder_path):\n", + " if \"acoustic_acoustic.\" in filename:\n", + " new_filename = filename.replace(\"acoustic_acoustic.\", \"acoustic_\")\n", + " elif \"variance_variance.\" in filename:\n", + " new_filename = filename.replace(\"variance_variance.\", \"variance_\")\n", + " else:\n", + " new_filename = filename\n", + " old_path = os.path.join(folder_path, filename)\n", + " new_path = os.path.join(folder_path, new_filename)\n", + " os.rename(old_path, new_path)\n", + 
"print(\"\\n\")\n", + "print(\"ONNX export complete! Please refer to https://github.com/xunmengshe/OpenUtau/wiki/Voicebank-Development to make your model OU compatible\")\n", + "print(\"\\n\")\n", + "print(\"Or use the 'Build OpenUtau VB' cell to have things set up for you\")\n" + ], + "metadata": { + "id": "x33iZhZchEMW", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Miscellaneous**" + ], + "metadata": { + "id": "4sbU1aH5kGFE" + } + }, + { + "cell_type": "code", + "source": [ + "#@title #Raw data conversion\n", + "#@markdown ___\n", + "%cd /content\n", + "#@markdown This cell will export .lab and .ds files along with your data\n", + "\n", + "data_type = \"lab + wav (NNSVS format)\" # @param [\"lab + wav (NNSVS format)\"]\n", + "\n", + "#@markdown The path to your data zip file\n", + "\n", + "data_zip_path = \"\" #@param {type:\"string\"}\n", + "\n", + "#@markdown The path you will be saving the data to\n", + "\n", + "data_save_path = \"\" #@param {type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "export_ds = True\n", + "\n", + "#@markdown _These values can exceed the amount that's in your data to maximize the segment length or to keep the data as is_\n", + "\n", + "#@markdown Determine how long it will segment your data to based on silence phoneme placement (seconds)\n", + "segment_length = 15 #@param {type:\"slider\", min:5, max:35, step:1}\n", + "\n", + "#@markdown Determine how many silence phoneme is allowed in the middle of each segment\n", + "max_silence_phoneme_amount = 2 #@param {type:\"slider\", min:0, max:50, step:1}\n", + "\n", + "# leaving -S at 60 so max silence can be 60 seconds that exceeds the segment legnth cap idk why///\n", + "# making the segment length cap at 35 secs because any longer than that would make training goes really slow\n", + "\n", + "# my ass dont remember why i made two... 
i think one is unnecessary extra but mehhh\n", + "all_shits = \"/content/raw_data\"\n", + "all_shits_not_wav_n_lab = \"/content/raw_data/diffsinger_db\"\n", + "\n", + "import os\n", + "import csv\n", + "import json\n", + "import shutil\n", + "from pydub import AudioSegment\n", + "\n", + "if os.path.exists(\"/content/raw_data\"):\n", + " shutil.rmtree(\"/content/raw_data\")\n", + "\n", + "if not os.path.exists(all_shits_not_wav_n_lab):\n", + " os.makedirs(all_shits_not_wav_n_lab)\n", + "\n", + "# using 'if not' bc i edited the wrong section which im also too lazy to fix it <3\n", + "if not data_type == \"lab + wav (NNSVS format)\":\n", + " #changed to 7zip to support more compression types\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + " for root, dirs, files in os.walk(all_shits):\n", + " for filename in files:\n", + " if filename.endswith(\".lab\"):\n", + " file_path = os.path.join(root, filename)\n", + " with open(file_path, \"r\") as file:\n", + " file_data = file.read()\n", + " file_data = file_data.replace(\"SP\", \"pau\")\n", + " file_data = file_data.replace(\"br\", \"AP\")\n", + " with open(file_path, \"w\") as file:\n", + " file.write(file_data)\n", + "\n", + "else:\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + "\n", + "\n", + "# for funny auto dict generator lmao\n", + "out = \"/content/raw_data/custom_dict.txt\"\n", + "\n", + "phonemes = set()\n", + "\n", + "def is_excluded(phoneme):\n", + " return phoneme in [\"pau\", \"AP\", \"SP\"]\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " phoneme_folder_path = all_shits\n", + " for root, dirs, files in os.walk(phoneme_folder_path):\n", + " for file in files:\n", + " if file.endswith(\".lab\"):\n", + " fpath = os.path.join(root, file)\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " line = line.strip()\n", + " if line:\n", + " phoneme = line.split()[2]\n", + " if not is_excluded(phoneme):\n", + " 
phonemes.add(phoneme)\n", + "\n", + "with open(out, \"w\") as f:\n", + " for phoneme in sorted(phonemes):\n", + " f.write(phoneme + \"\t\" + phoneme + \"\\n\")\n", + "\n", + "# for vowels and consonants.txt.... well adding liquid type for uta's script\n", + "dict_path = out\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\"}\n", + "liquid_types = {\"y\", \"w\", \"l\", \"r\"} # r for english labels, it should be fine with jp too\n", + "vowel_data = []\n", + "consonant_data = []\n", + "liquid_data = []\n", + "\n", + "with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " if phoneme[0] in vowel_types:\n", + " vowel_data.append(phoneme)\n", + " elif phoneme[0] in liquid_types:\n", + " liquid_data.append(phoneme)\n", + " else:\n", + " consonant_data.append(phoneme)\n", + "\n", + "vowel_data.sort()\n", + "liquid_data.sort()\n", + "consonant_data.sort()\n", + "directory = os.path.dirname(dict_path)\n", + "\n", + "# make txt for language json file\n", + "vowel_txt_path = os.path.join(directory, \"vowels.txt\")\n", + "with open(vowel_txt_path, \"w\") as f:\n", + " f.write(\" \".join(vowel_data))\n", + "liquid_txt_path = os.path.join(directory, \"liquids.txt\")\n", + "with open(liquid_txt_path, \"w\") as f:\n", + " f.write(\" \".join(liquid_data))\n", + "consonant_txt_path = os.path.join(directory, \"consonants.txt\")\n", + "with open(consonant_txt_path, \"w\") as f:\n", + " f.write(\" \".join(consonant_data))\n", + "\n", + "\n", + "# here's a funny json append\n", + "with open(vowel_txt_path, \"r\") as f:\n", + " vowel_data = f.read().split()\n", + "with open(liquid_txt_path, \"r\") as f:\n", + " liquid_data = f.read().split()\n", + "with open(consonant_txt_path, \"r\") as f:\n", + " consonant_data = f.read().split()\n", + "phones4json = {\"vowels\": vowel_data, \"liquids\": liquid_data}\n", + "with open(\"/content/nnsvs-db-converter/lang.sample.json\", \"w\") as rawr:\n", + " 
json.dump(phones4json, rawr, indent=4)\n", + "\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " db_converter_script = \"/content/nnsvs-db-converter/db_converter.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -S 60 -l {segment_length} ${export_lab} -mD -c -L \"/content/nnsvs-db-converter/lang.sample.json\" -w htk --folder {raw_folder_path}\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab\n", + " !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null\n", + " !rm -rf {raw_folder_path}/diffsinger_db\n", + " #!cp {raw_folder_path}/wavs/*.wav {raw_folder_path}\n", + "\n", + "# make it replace the first SP to AP cus it seems like people always forgot about it\n", + "for root, _, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " if file.endswith(\".csv\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\", newline=\"\") as input_file:\n", + " csv_reader = csv.reader(input_file)\n", + " data = [row for row in csv_reader]\n", + " header = data[0]\n", + " if \"ph_seq\" in header:\n", + " ph_seq_index = header.index(\"ph_seq\")\n", + " if len(data) > 1 and len(data[1]) > ph_seq_index:\n", + " data[1][ph_seq_index] = data[1][ph_seq_index].replace(\"SP\", \"AP\", 1)\n", + " with open(file_path, \"w\", newline=\"\") as output_file:\n", + " csv_writer = csv.writer(output_file)\n", + " csv_writer.writerows(data)\n", + "\n", + "print(\"extraction complete!\")\n", + "print(\"\\n\")\n", + "print(\"zipping up files...\")\n", + "!zip -q -9 -r 
{data_save_path}/data.zip /content/raw_data/*" + ], + "metadata": { + "cellView": "form", + "id": "AI7EQ2jQkGEq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Build OpenUtau VB\n", + "#@markdown ___\n", + "#i need to clean this up it seems\n", + "#plan: add a build ou section here by inserting onnx paths (or just the folder containing the folders to the onnx files) to build ou\n", + "# ill have a config read function too so i dont have to add checkmark of if people train with embeds or shallow diff or not <3\n", + "# yes im lazy rawr x3\n", + "%cd /content\n", + "import os\n", + "import shutil\n", + "import yaml\n", + "from IPython.display import clear_output\n", + "\n", + "constr_folder = \"/content/OU_voicebank\"\n", + "if not os.path.exists(constr_folder):\n", + " os.makedirs(constr_folder)\n", + "else:\n", + " shutil.rmtree(constr_folder)\n", + " os.makedirs(constr_folder)\n", + "\n", + "clear_output()\n", + "\n", + "#@markdown path to your **ACOUSTIC ONNX FOLDER**\n", + "acoustic_onnx_folder = \"\" #@param{type:\"string'}\n", + "#@markdown path to the config.yaml of acoustic model\n", + "acoustic_config = \"\" #@param{type:\"string'}\n", + "\n", + "#@markdown path to your **VARIANCE ONNX FOLDER**\n", + "variance_onnx_folder = \"\" #@param{type:\"string'}\n", + "#@markdown path to the config.yaml of variance model\n", + "variance_config = \"\" #@param{type:\"string'}\n", + "\n", + "#@markdown path to your word to phoneme dict (leave blank to use default Japanese dict)\n", + "dictionary_path = \"\" #@param{type:\"string\"}\n", + "\n", + "#@markdown path to the folder you want to save the zip file to\n", + "save_path = \"\" #@param{type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ## Character Configuration | character.txt and character.yaml\n", + "\n", + "#@markdown your character display name| **required**\n", + "name = \"\" #@param{type:\"string\"}\n", + "\n", + 
"print(\"copying files...\")\n", + "main_stuff = f\"{constr_folder}/{name}\"\n", + "if not os.path.exists(main_stuff):\n", + " os.makedirs(main_stuff)\n", + "if not os.path.exists(f\"{main_stuff}/dsmain\"):\n", + " os.makedirs(f\"{main_stuff}/dsmain/embeds/acoustic\")\n", + " os.makedirs(f\"{main_stuff}/dsmain/embeds/variance\")\n", + "!cp {acoustic_onnx_folder}/acoustic.onnx {main_stuff}/dsmain\n", + "!cp {acoustic_onnx_folder}/phonemes.txt {main_stuff}/dsmain\n", + "!cp {acoustic_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/acoustic >/dev/null 2>&1\n", + "!cp {variance_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/variance >/dev/null 2>&1\n", + "\n", + "if variance_onnx_folder:\n", + " !cp {variance_onnx_folder}/linguistic.onnx {main_stuff}/dsmain\n", + "else:\n", + " pass\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing character.txt...\")\n", + "with open(f\"{main_stuff}/character.txt\", \"w\") as file:\n", + " file.write(f\"name={name}\\n\")\n", + " file.write(\"image=\\n\")\n", + " file.write(\"author=\\n\")\n", + " file.write(\"voice=\\n\")\n", + " file.write(\"web=\\n\")\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing character.yaml...\")\n", + "with open(f\"{main_stuff}/character.yaml\", \"w\") as file:\n", + " file.write(\"text_file_encoding: utf-8\\n\")\n", + " file.write(\"portrait:\\n\")\n", + " file.write(\"portrait_opacity: 0.45\\n\")\n", + " file.write(\"default_phonemizer: OpenUtau.Core.DiffSinger.DiffSingerPhonemizer\\n\")\n", + " file.write(\"singer_type: diffsinger\\n\")\n", + "acoustic_emb_files = os.listdir(acoustic_onnx_folder)\n", + "acoustic_embeds = []\n", + "acoustic_color_suffix = []\n", + "for file in acoustic_emb_files:\n", + " if file.endswith(\".emb\"):\n", + " acoustic_emb = os.path.splitext(file)[0]\n", + " acoustic_embeds.append(\"dsmain/embeds/acoustic/\" + acoustic_emb)\n", + " acoustic_color_suffix.append(acoustic_emb)\n", + "subbanks = []\n", + "for i, (acoustic_embed_color, acoustic_embed_suffix) in 
enumerate(zip(acoustic_color_suffix, acoustic_embeds), start=1):\n", + " color = f\"{i:02}: {acoustic_embed_color}\"\n", + " suffix = f\"{acoustic_embed_suffix}\"\n", + " subbanks.append({\"color\": color, \"suffix\": suffix})\n", + "if subbanks:\n", + " with open(f\"{main_stuff}/character.yaml\", \"r\") as config:\n", + " i_wanna_die_slash_j = yaml.safe_load(config)\n", + " i_wanna_die_slash_j[\"subbanks\"] = subbanks\n", + " with open(f\"{main_stuff}/character.yaml\", \"w\") as config:\n", + " yaml.dump(i_wanna_die_slash_j, config)\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing dsconfig.yaml for acoustic...\")\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: dsmain/phonemes.txt\\n\")\n", + " file.write(\"acoustic: dsmain/acoustic.onnx\\n\")\n", + " file.write(\"vocoder: nsf_hifigan\\n\")\n", + " file.write(\"singer_type: diffsinger\\n\")\n", + "with open(acoustic_config, \"r\") as config:\n", + " mfking_config = yaml.safe_load(config)\n", + "use_energy_embed = mfking_config.get(\"use_energy_embed\")\n", + "use_breathiness_embed = mfking_config.get(\"use_breathiness_embed\")\n", + "use_shallow_diffusion = mfking_config.get(\"use_shallow_diffusion\")\n", + "max_depth = mfking_config.get(\"T_start\")\n", + "speakers = mfking_config.get(\"speakers\") #looking back here, why is this even here lmao cus i used acoustic_embeds instead of speakers\n", + "augmentation_arg = mfking_config.get(\"augmentation_args\")\n", + "pitch_aug = mfking_config.get(\"use_key_shift_embed\")\n", + "time_aug = mfking_config.get(\"use_speed_embed\")\n", + "voicing = mfking_config.get(\"use_voicing_embed\")\n", + "tension = mfking_config.get(\"use_tension_embed\")\n", + "sample_rate = mfking_config.get(\"audio_sample_rate\")\n", + "hop_size = mfking_config.get(\"hop_size\")\n", + "win_size = mfking_config.get(\"win_size\")\n", + "fft_size = mfking_config.get(\"fft_size\")\n", + "num_mel_bins = mfking_config.get(\"audio_num_mel_bins\")\n", 
+ "mel_fmin = mfking_config.get(\"fmin\")\n", + "mel_fmax = mfking_config.get(\"fmax\")\n", + "mel_base = mfking_config.get(\"mel_base\")\n", + "\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"r\") as config:\n", + " why_are_there_so_many_i_could_prob_make_it_one = yaml.safe_load(config)\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_energy_embed\"] = use_energy_embed\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_breathiness_embed\"] = use_breathiness_embed\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_variable_depth\"] = use_shallow_diffusion\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"max_depth\"] = max_depth\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"augmentation_args\"] = augmentation_arg\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_key_shift_embed\"] = pitch_aug\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_speed_embed\"] = time_aug\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_voicing_embed\"] = voicing\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_tension_embed\"] = tension\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_continuous_acceleration\"] = True\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"sample_rate\"] = sample_rate\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"hop_size\"] = hop_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"win_size\"] = win_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fft_size\"] = fft_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"num_mel_bins\"] = num_mel_bins\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fmin\"] = mel_fmin\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fmax\"] = mel_fmax\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"mel_base\"] = mel_base\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"mel_scale\"] = \"slaney\"\n", + "\n", + "\n", + "if subbanks:\n", + " 
why_are_there_so_many_i_could_prob_make_it_one[\"speakers\"] = acoustic_embeds\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(why_are_there_so_many_i_could_prob_make_it_one, config)\n", + "\n", + "\n", + "variance_emb_files = os.listdir(variance_onnx_folder)\n", + "variance_embeds = []\n", + "for file in variance_emb_files:\n", + " if file.endswith(\".emb\"):\n", + " variance_emb = os.path.splitext(file)[0]\n", + " variance_embeds.append(\"../dsmain/embeds/variance/\" + variance_emb)\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing dsdict.yaml...\")\n", + "if not dictionary_path:\n", + " dict_path = \"/content/jpn_dict.txt\"\n", + "else:\n", + " dict_path = dictionary_path\n", + "\n", + "# for symbols list\n", + "phoneme_dict_path = f\"{acoustic_onnx_folder}/dictionary.txt\"\n", + "\n", + "dsdict = \"dsdict.yaml\"\n", + "\n", + "def parse_phonemes(phonemes_str):\n", + " return phonemes_str.split()\n", + "\n", + "entries = []\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\", \"cl\", \"vf\"}\n", + "vowel_data = []\n", + "stop_data = []\n", + "\n", + "# Process the specified dictionary\n", + "with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " word, phonemes_str = line.strip().split(\"\\t\")\n", + " phonemes = parse_phonemes(phonemes_str)\n", + " if len(phonemes) == 1:\n", + " entries.append({\"grapheme\": word, \"phonemes\": phonemes})\n", + " else:\n", + " entries.append({\"grapheme\": word, \"phonemes\": phonemes})\n", + "\n", + "with open(phoneme_dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " phoneme_type = \"vowel\" if phoneme[0] in vowel_types else \"stop\"\n", + " entry = {\"symbol\": phoneme, \"type\": phoneme_type}\n", + " if phoneme_type == \"vowel\":\n", + " vowel_data.append(entry)\n", + " else:\n", + " stop_data.append(entry)\n", + "\n", + "vowel_data.sort(key=lambda x: x[\"symbol\"])\n", + 
"stop_data.sort(key=lambda x: x[\"symbol\"])\n", + "\n", + "dsdict_path = os.path.join(constr_folder, dsdict)\n", + "with open(dsdict_path, \"w\") as f:\n", + " f.write(\"entries:\\n\")\n", + " for entry in entries:\n", + " f.write(f\"- grapheme: {entry['grapheme']}\\n\")\n", + " f.write(\" phonemes:\\n\")\n", + " for phoneme in entry[\"phonemes\"]:\n", + " f.write(f\" - {phoneme}\\n\")\n", + "\n", + " f.write(\"\\nsymbols:\\n\")\n", + " for entry in vowel_data + stop_data:\n", + " f.write(f\"- symbol: {entry['symbol']}\\n\")\n", + " f.write(f\" type: {entry['type']}\\n\")\n", + "\n", + "with open(variance_config, \"r\") as config:\n", + " mfking_config = yaml.safe_load(config)\n", + "sample_rate = mfking_config.get(\"audio_sample_rate\")\n", + "hop_size = mfking_config.get(\"hop_size\")\n", + "predict_dur = mfking_config.get(\"predict_dur\")\n", + "predict_pitch = mfking_config.get(\"predict_pitch\")\n", + "use_melody_encoder = mfking_config.get(\"use_melody_encoder\")\n", + "predict_voicing = mfking_config.get(\"predict_voicing\")\n", + "predict_tension = mfking_config.get(\"predict_tension\")\n", + "predict_energy = mfking_config.get(\"predict_energy\")\n", + "predict_breathiness = mfking_config.get(\"predict_breathiness\")\n", + "\n", + "dur_onnx_path = variance_onnx_folder + \"/dur.onnx\"\n", + "if os.path.exists(dur_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dsdur directory and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dsdur\")\n", + " !cp {dur_onnx_path} {main_stuff}/dsdur\n", + " !cp {dsdict_path} {main_stuff}/dsdur\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"dur: dur.onnx\\n\")\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"r\") as config:\n", + " dsdur_config = yaml.safe_load(config)\n", + " dsdur_config[\"use_continuous_acceleration\"] = 
True\n", + " dsdur_config[\"sample_rate\"] = sample_rate\n", + " dsdur_config[\"hop_size\"] = hop_size\n", + " dsdur_config[\"predict_dur\"] = predict_dur\n", + " if subbanks:\n", + " dsdur_config[\"speakers\"] = variance_embeds\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dsdur_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"dur.onnx not found, skipping on making dsdur folder...\")\n", + "\n", + "pitch_onnx_path = variance_onnx_folder + \"/pitch.onnx\"\n", + "if os.path.exists(pitch_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dspitch directory and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dspitch\")\n", + " !cp {pitch_onnx_path} {main_stuff}/dspitch\n", + " !cp {dsdict_path} {main_stuff}/dspitch\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"pitch: pitch.onnx\\n\")\n", + " file.write(\"use_expr: true\\n\")\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"r\") as config:\n", + " dspitch_config = yaml.safe_load(config)\n", + " dspitch_config[\"use_continuous_acceleration\"] = True\n", + " dspitch_config[\"sample_rate\"] = sample_rate\n", + " dspitch_config[\"hop_size\"] = hop_size\n", + " dspitch_config[\"predict_dur\"] = predict_pitch\n", + " if subbanks:\n", + " dspitch_config[\"speakers\"] = variance_embeds\n", + " dspitch_config[\"use_note_rest\"] = use_melody_encoder\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dspitch_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"pitch.onnx not found, skipping on making dspitch folder...\")\n", + "\n", + "variance_onnx_path = variance_onnx_folder + \"/variance.onnx\"\n", + "if os.path.exists(variance_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dsvariance directory 
and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dsvariance\")\n", + " !cp {variance_onnx_path} {main_stuff}/dsvariance\n", + " !cp {dsdict_path} {main_stuff}/dsvariance\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"variance: variance.onnx\\n\")\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"r\") as config:\n", + " dsvariance_config = yaml.safe_load(config)\n", + " dsvariance_config[\"use_continuous_acceleration\"] = True\n", + " dsvariance_config[\"sample_rate\"] = sample_rate\n", + " dsvariance_config[\"hop_size\"] = hop_size\n", + " dsvariance_config[\"predict_dur\"] = True #this one will always be true cus if there's no variance model, it shouldnt make this folder in the first place\n", + " dsvariance_config[\"predict_voicing\"] = predict_voicing\n", + " dsvariance_config[\"predict_tension\"] = predict_tension\n", + " dsvariance_config[\"predict_energy\"] = predict_energy\n", + " dsvariance_config[\"predict_breathiness\"] = predict_breathiness\n", + " if subbanks:\n", + " dsvariance_config[\"speakers\"] = variance_embeds\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dsvariance_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"variance.onnx not found, skipping on making dsvariance folder...\")\n", + "\n", + "!rm -rf {dsdict_path}\n", + "#im too lazy to write codes so ill just do this, itll only remove those folders if they're empty anyway\n", + "!rm -d {main_stuff}/dsmain/embeds/* >/dev/null 2>&1\n", + "!rm -d {main_stuff}/dsmain/embeds >/dev/null 2>&1\n", + "\n", + "print(\"\\n\")\n", + "print(\"zipping up files...\")\n", + "!zip -q -9 -r {save_path}/{name}.zip {main_stuff}/*\n", + "\n", + "print(\"\\n\")\n", + "print(\"done!\")\n", + "\n", + "print(\"\\n\")\n", + "print(\"You can 
download your model zip and use it in OpenUtau! If anything needed to be edit in the config then please do so\")" + ], + "metadata": { + "cellView": "form", + "id": "A70Sc3Hbmxh0" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID b/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID new file mode 100644 index 000000000..46ab5d94d --- /dev/null +++ b/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID @@ -0,0 +1,61 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using OpenUtau.Api; +using Serilog; + +namespace OpenUtau.Plugin.Builtin { + /// + /// The Thai Arpasing Phonemizer. + /// + /// Arpasing is a system that uses CMUdict as dictionary to convert Thai words to phoneme symbols. + /// See http://www.speech.cs.cmu.edu/cgi-bin/cmudict and https://arpasing.neocities.org/en/faq.html. + /// + /// + [Phonemizer("Thai Arpasing Phonemizer", "THAI ARPA")] + public class ArpasingPhonemizer : LatinDiphonePhonemizer { + public ArpasingPhonemizer() { + try { + Initialize(); + } catch (Exception e) { + Log.Error(e, "Failed to initialize."); + } + } + + protected override IG2p LoadG2p() { + var g2ps = new List(); + + // Load dictionary from plugin folder. + string path = Path.Combine(PluginDir, "Thai arpasing.yaml"); + if (!File.Exists(path)) { + Directory.CreateDirectory(PluginDir); + File.WriteAllBytes(path, Data.Resources.arpasing_template); + } + g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(path)).Build()); + + // Load dictionary from singer folder. + if (singer != null && singer.Found && singer.Loaded) { + string file = Path.Combine(singer.Location, "arpasing.yaml"); + if (File.Exists(file)) { + try { + g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build()); + } catch (Exception e) { + Log.Error(e, $"Failed to load {file}"); + } + } + } + + // Load base g2p. 
+ g2ps.Add(new ArpabetG2p()); + + return new G2pFallbacks(g2ps.ToArray()); + } + + protected override Dictionary LoadVowelFallbacks() { + return "aa=ah,ae;ae=ah,aa;ah=aa,ae;ao=ow;ow=ao;eh=ae;ih=iy;iy=ih;uh=uw;uw=uh;aw=ao".Split(';') + .Select(entry => entry.Split('=')) + .ToDictionary(parts => parts[0], parts => parts[1].Split(',')); + } + } +} diff --git a/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs new file mode 100644 index 000000000..4066983fd --- /dev/null +++ b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs @@ -0,0 +1,338 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text.RegularExpressions; +using Melanchall.DryWetMidi.Interaction; +using OpenUtau.Api; +using OpenUtau.Classic; +using OpenUtau.Core.Ustx; +using Serilog; + +namespace OpenUtau.Plugin.Builtin { + [Phonemizer("Thai VCCV Phonemizer", "TH VCCV", "PRINTmov", language: "TH")] + public class ThaiVCCVPhonemizer : Phonemizer { + + readonly string[] vowels = new string[] { + "a", "i", "u", "e", "o", "@", "Q", "3", "6", "1", "ia", "ua", "I", "8" + }; + + readonly string[] diphthongs = new string[] { + "r", "l", "w" + }; + + readonly string[] consonants = new string[] { + "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y" + }; + + readonly string[] endingConsonants = new string[] { + "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y" + }; + + private readonly Dictionary VowelMapping = new Dictionary { + {"เcือะ", "6"}, {"เcือx", "6"}, {"แcะ", "@"}, {"แcx", "@"}, {"เcอะ", "3"}, {"เcอ", "3"}, {"ไc", "I"}, {"ใc", "I"}, {"เcาะ", "Q"}, {"cอx", "Q"}, + {"cืx", "1"}, {"cึx", "1"}, {"cือ", "1"}, {"cะ", "a"}, {"cัx", "a"}, {"cาx", "a"}, {"เcา", "8"}, {"เcะ", "e"}, {"เcx", "e"}, {"cิx", "i"}, {"cีx", "i"}, + {"เcียะ", "ia"}, {"เcียx", "ia"}, {"โcะ", "o"}, {"โcx", "o"}, {"cุx", 
"u"}, {"cูx", "u"}, {"cัวะ", "ua"}, {"cัว", "ua"}, {"cำ", "am"}, {"เcิx", "3"}, {"เcิ", "3"}
        };

        // Thai initial consonant letter -> romanized onset symbol.
        private readonly Dictionary<char, string> CMapping = new Dictionary<char, string> {
            {'-', ""}, {'อ', ""}, // '-' and silent 'อ' map to empty so the note can start directly on the vowel
            {'ก', "k"}, {'ข', "kh"}, {'ค', "kh"}, {'ฆ', "kh"}, {'ฅ', "kh"}, {'ฃ', "kh"},
            {'จ', "j"}, {'ฉ', "ch"}, {'ช', "ch"}, {'ฌ', "ch"},
            {'ฎ', "d"}, {'ด', "d"},
            {'ต', "t"}, {'ฏ', "t"},
            {'ถ', "th"}, {'ฐ', "th"}, {'ฑ', "th"}, {'ธ', "th"}, {'ท', "th"},
            {'บ', "b"}, {'ป', "p"}, {'พ', "ph"}, {'ผ', "ph"}, {'ภ', "ph"}, {'ฟ', "f"}, {'ฝ', "f"},
            {'ห', "h"}, {'ฮ', "h"},
            {'ม', "m"}, {'น', "n"}, {'ณ', "n"}, {'ร', "r"}, {'ล', "l"}, {'ฤ', "r"},
            {'ส', "s"}, {'ศ', "s"}, {'ษ', "s"}, {'ซ', "s"},
            {'ง', "g"}, {'ย', "y"}, {'ญ', "y"}, {'ว', "w"}, {'ฬ', "r"}
        };

        // Thai final consonant letter -> romanized coda symbol (letters collapse into
        // the handful of final sound classes Thai actually distinguishes).
        private readonly Dictionary<char, string> XMapping = new Dictionary<char, string> {
            {'บ', "b"}, {'ป', "b"}, {'พ', "b"}, {'ฟ', "b"}, {'ภ', "b"},
            {'ด', "d"}, {'จ', "d"}, {'ช', "d"}, {'ซ', "d"}, {'ฎ', "d"}, {'ฏ', "d"}, {'ฐ', "d"},
            {'ฑ', "d"}, {'ฒ', "d"}, {'ต', "d"}, {'ถ', "d"}, {'ท', "d"}, {'ธ', "d"}, {'ศ', "d"}, {'ษ', "d"}, {'ส', "d"},
            {'ก', "k"}, {'ข', "k"}, {'ค', "k"}, {'ฆ', "k"},
            {'ว', "w"},
            {'ย', "y"},
            {'น', "n"}, {'ญ', "n"}, {'ณ', "n"}, {'ร', "n"}, {'ล', "n"}, {'ฬ', "n"},
            {'ง', "g"},
            {'ม', "m"}
        };

        private USinger singer;
        public override void SetSinger(USinger singer) => this.singer = singer;

        /// <summary>
        /// Tries each candidate alias in order against the singer's oto, honoring the
        /// note's tone shift and voice color; returns the first mapped oto found.
        /// </summary>
        private bool checkOtoUntilHit(string[] input, Note note, out UOto oto) {
            oto = default;
            var attr = note.phonemeAttributes?.FirstOrDefault(a => a.index == 0) ?? default;

            foreach (string candidateAlias in input) {
                if (singer.TryGetMappedOto(candidateAlias, note.tone + attr.toneShift, attr.voiceColor, out var candidate)) {
                    oto = candidate;
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Assembles CV / VC / rest aliases for one note (plus its extender notes) and
        /// positions each phoneme inside the note's duration.
        /// </summary>
        public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? nextNeighbour, Note[] prevNeighbours) {
            var note = notes[0];
            var currentLyric = note.lyric.Normalize();
            if (!string.IsNullOrEmpty(note.phoneticHint)) {
                // A phonetic hint overrides the written lyric entirely.
                currentLyric = note.phoneticHint.Normalize();
            }

            var phonemes = new List<Phoneme>();
            var aliases = new List<string>();

            string prevLyric = "";
            if (prevNeighbour != null) {
                prevLyric = prevNeighbour.Value.lyric;
            }
            var prevTh = ParseInput(prevLyric);
            var noteTh = ParseInput(currentLyric);

            // --- Main body: CV, or C + glide + V when the onset is a cluster. ---
            if (noteTh.Consonant != null && noteTh.Dipthong == null && noteTh.Vowel != null) {
                if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Vowel }, note, out var found)) {
                    aliases.Add(found.Alias);
                }
            } else if (noteTh.Consonant != null && noteTh.Dipthong != null && noteTh.Vowel != null) {
                // Prefer a single "CglideV" alias; otherwise split into "Cglide" + "glideV".
                if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong + noteTh.Vowel }, note, out var found)) {
                    aliases.Add(found.Alias);
                } else {
                    if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong }, note, out found)) {
                        aliases.Add(found.Alias);
                    }
                    if (checkOtoUntilHit(new string[] { noteTh.Dipthong + noteTh.Vowel }, note, out found)) {
                        aliases.Add(found.Alias);
                    }
                }
            }

            // --- Vowel-only note: bridge from the previous note's coda or vowel. ---
            if (noteTh.Consonant == null && noteTh.Vowel != null) {
                if (prevTh.EndingConsonant != null && checkOtoUntilHit(new string[] { prevTh.EndingConsonant + noteTh.Vowel }, note, out var found)) {
                    aliases.Add(found.Alias);
                } else if (prevTh.Vowel != null && checkOtoUntilHit(new string[] { prevTh.Vowel + noteTh.Vowel }, note, out found)) {
                    aliases.Add(found.Alias);
                } else if (checkOtoUntilHit(new string[] { noteTh.Vowel }, note, out found)) {
                    aliases.Add(found.Alias);
                }
            }

            // --- Coda: own ending consonant, or a "V nextC" transition alias. ---
            if (noteTh.EndingConsonant != null && noteTh.Vowel != null) {
                if (checkOtoUntilHit(new string[] { noteTh.Vowel + noteTh.EndingConsonant }, note, out var found)) {
                    aliases.Add(found.Alias);
                }
            } else if (nextNeighbour != null && noteTh.Vowel != null) {
                var nextTh = ParseInput(nextNeighbour.Value.lyric);
                if (checkOtoUntilHit(new string[] { noteTh.Vowel + " " + nextTh.Consonant }, note, out var found)) {
                    aliases.Add(found.Alias);
                }
            }

            // Phrase start: try a "-CV" attack variant of the first alias.
            if (prevNeighbour == null && aliases.Count >= 1) {
                if (checkOtoUntilHit(new string[] { "-" + aliases[0] }, note, out var found)) {
                    aliases[0] = (found.Alias);
                }
            }

            // Phrase end: append "V-" or turn the last alias into its "...-" release form.
            if (nextNeighbour == null && aliases.Count >= 1) {
                if (noteTh.EndingConsonant == null) {
                    if (checkOtoUntilHit(new string[] { noteTh.Vowel + "-" }, note, out var found)) {
                        aliases.Add(found.Alias);
                    }
                } else {
                    if (checkOtoUntilHit(new string[] { aliases[aliases.Count - 1] + "-" }, note, out var found)) {
                        aliases[aliases.Count - 1] = (found.Alias);
                    }
                }
            }

            // Fallback: use the raw lyric if nothing else matched.
            // NOTE(review): this adds currentLyric rather than found.Alias — confirm the
            // un-mapped spelling (not the color/tone-mapped alias) is intended here.
            if (aliases.Count <= 0) {
                if (checkOtoUntilHit(new string[] { currentLyric }, note, out var found)) {
                    aliases.Add(currentLyric);
                }
            }

            if (checkOtoUntilHit(aliases.ToArray(), note, out var baseOto)) {
                var noteDuration = notes.Sum(n => n.duration);
                for (int i = 0; i < aliases.Count; i++) {
                    int position = 0;
                    int vcPosition = noteDuration - 120; // default VC onset: 120 ticks before the note ends

                    // For a "V nextC" transition, pull the VC earlier by the next note's
                    // overlap + preutterance so the consonant lands on the next downbeat.
                    if (nextNeighbour != null && aliases[i].Contains(" ")) {
                        var nextLyric = nextNeighbour.Value.lyric.Normalize();
                        if (!string.IsNullOrEmpty(nextNeighbour.Value.phoneticHint)) {
                            nextLyric = nextNeighbour.Value.phoneticHint.Normalize();
                        }
                        var nextTh = ParseInput(nextLyric);
                        var nextCheck = nextTh.Vowel;
                        if (nextTh.Consonant != null) {
                            nextCheck = nextTh.Consonant + nextTh.Vowel;
                        }
                        if (nextTh.Dipthong != null) {
                            nextCheck = nextTh.Consonant + nextTh.Dipthong + nextTh.Vowel;
                        }
                        var nextAttr = nextNeighbour.Value.phonemeAttributes?.FirstOrDefault(a => a.index == 0) ?? default;
                        if (singer.TryGetMappedOto(nextCheck, nextNeighbour.Value.tone + nextAttr.toneShift, nextAttr.voiceColor, out var nextOto)) {
                            // NOTE(review): gates on the CURRENT oto's Overlap but applies the
                            // NEXT oto's timings — looks asymmetric; confirm it is intentional.
                            if (baseOto.Overlap > 0) {
                                vcPosition = noteDuration - MsToTick(nextOto.Overlap) - MsToTick(nextOto.Preutter);
                            }
                        }
                    }

                    // Position the 2nd (and for clusters, 3rd) phoneme inside the note.
                    if (noteTh.Dipthong == null || aliases.Count <= 2) {
                        if (i == 1) {
                            position = Math.Max((int)(noteDuration * 0.75), vcPosition);
                        }
                    } else {
                        if (i == 1) {
                            position = Math.Min((int)(noteDuration * 0.1), 60);
                        } else if (i == 2) {
                            position = Math.Max((int)(noteDuration * 0.75), vcPosition);
                        }
                    }

                    phonemes.Add(new Phoneme { phoneme = aliases[i], position = position });
                }
            }

            return new Result {
                phonemes = phonemes.ToArray()
            };
        }

        /// <summary>
        /// Romanizes the input via WordToPhonemes, then greedily splits it into
        /// (initial consonant, glide, vowel, final consonant); any part may be null.
        /// </summary>
        (string Consonant, string Dipthong, string Vowel, string EndingConsonant) ParseInput(string input) {
            input = WordToPhonemes(input);

            string consonant = null;
            string diphthong = null;
            string vowel = null;
            string endingConsonant = null;

            if (input == null) {
                return (null, null, null, null);
            }

            // Longest-match scan for the onset symbol at the start of the string.
            foreach (var con in consonants) {
                if (input.StartsWith(con)) {
                    if (consonant == null || consonant.Length < con.Length) {
                        consonant = con;
                    }
                }
            }

            // Optional glide immediately after the onset.
            int startIdx = consonant?.Length ?? 0;
            foreach (var dip in diphthongs) {
                if (input.Substring(startIdx).StartsWith(dip)) {
                    if (diphthong == null || diphthong.Length < dip.Length) {
                        diphthong = dip;
                    }
                }
            }

            startIdx += diphthong?.Length ??
0;
            // Longest-match scan for the vowel symbol after onset (+ glide).
            foreach (var vow in vowels) {
                if (input.Substring(startIdx).StartsWith(vow, StringComparison.Ordinal)) {
                    if (vowel == null || vowel.Length < vow.Length) {
                        vowel = vow;
                    }
                }
            }

            // Longest-match scan for the final consonant at the end of the string.
            foreach (var con in endingConsonants) {
                if (input.EndsWith(con, StringComparison.Ordinal)) {
                    if (endingConsonant == null || endingConsonant.Length < con.Length) {
                        endingConsonant = con;
                    }
                }
            }

            return (consonant, diphthong, vowel, endingConsonant);
        }

        /// <summary>
        /// Converts one Thai-script word to its romanized phoneme string. Non-Thai
        /// input (already romanized) is returned unchanged. Falls back to positional
        /// heuristics (implied vowels "o"/"Q") when no spelling pattern matches.
        /// </summary>
        public string WordToPhonemes(string input) {
            input = input.Replace(" ", "");
            input = RemoveInvalidLetters(input);

            // Also match '-' so dash-prefixed input is not skipped by this early return.
            if (!Regex.IsMatch(input, "[ก-ฮ-]")) {
                return input;
            }

            foreach (var mapping in VowelMapping) {
                // The onset block (c) matches only real Thai consonant clusters, so a
                // final consonant is not mistakenly absorbed into a cluster
                // (e.g. แล้ว misread as แว้).
                string pattern = "^" + mapping.Key
                    .Replace("c", "(ก[รลว]|ข[รลว]|ค[รลว]|ต[รลว]|ป[รล]|พ[รลว]|ฟ[รล]|บ[รล]|ด[ร]|ผล|ทร|ศร|สร|ห[ก-ฮ]|อย|[ก-ฮ-])")
                    .Replace("x", "([ก-ฮ]?)") + "$";

                var match = Regex.Match(input, pattern);
                if (match.Success) {
                    string c = match.Groups[1].Value;
                    string x = match.Groups.Count > 2 ? match.Groups[2].Value : string.Empty;
                    // Leading ห/อ in a two-letter onset is a silent tone marker — drop it.
                    if (c.Length >= 2 && (c.StartsWith("ห") || c.StartsWith("อ"))) {
                        c = c.Substring(1);
                    }
                    string cConverted = ConvertC(c);
                    string xConverted = ConvertX(x);

                    // Special cases: ...ัว -> "ua"; เ-ย -> "3" + coda.
                    if (mapping.Value == "a" && input.Contains("ั") && x == "ว") {
                        return cConverted + "ua";
                    }
                    if (mapping.Value == "e" && x == "ย") {
                        return cConverted + "3" + xConverted;
                    }
                    return cConverted + mapping.Value + xConverted;
                }
            }
            // No explicit vowel pattern: apply Thai implied-vowel heuristics.
            if (input.Length == 1) {
                return ConvertC(input) + "Q";
            } else if (input.Length == 2) {
                return ConvertC(input[0].ToString()) + "o" + ConvertX(input[1].ToString());
            } else if (input.Length == 3) {
                if (input[1] == 'ว') {
                    return ConvertC(input[0].ToString()) + "ua" + ConvertX(input[2].ToString());
                } else {
                    // BUGFIX: the coda of a cluster+final word (e.g. กลม "klom") is the
                    // THIRD letter. The original read input[1], re-using the second
                    // cluster letter as the final (กลม -> "klon" instead of "klom").
                    return ConvertC(input.Substring(0, 2)) + "o" + ConvertX(input[2].ToString());
                }
            } else if (input.Length == 4) {
                if (input[2] == 'ว') {
                    return ConvertC(input.Substring(0, 2)) + "ua" + ConvertX(input[3].ToString());
                }
            }
            return input;
        }

        /// <summary>
        /// Romanizes an onset of one or two Thai letters via CMapping; unmapped
        /// input is returned unchanged.
        /// </summary>
        private string ConvertC(string input) {
            if (string.IsNullOrEmpty(input)) return input;
            char firstChar = input[0];
            char? secondChar = input.Length > 1 ? input[1] : (char?)null;
            if (CMapping.ContainsKey(firstChar)) {
                string firstCharConverted = CMapping[firstChar];
                if (secondChar != null && CMapping.ContainsKey((char)secondChar)) {
                    return firstCharConverted + CMapping[(char)secondChar];
                }
                return firstCharConverted;
            }
            return input;
        }

        /// <summary>
        /// Romanizes a final consonant via XMapping; unmapped input is returned unchanged.
        /// </summary>
        private string ConvertX(string input) {
            if (string.IsNullOrEmpty(input)) return input;
            char firstChar = input[0];
            if (XMapping.ContainsKey(firstChar)) {
                return XMapping[firstChar];
            }
            return input;
        }

        /// <summary>
        /// Strips letters silenced by thanthakhat (X์) plus tone marks and maitaikhu,
        /// none of which affect the phonemes.
        /// </summary>
        private string RemoveInvalidLetters(string input) {
            input = Regex.Replace(input, ".์", "");
            input = Regex.Replace(input, "[่้๊๋็]", "");
            return input;
        }

    }
}