From 673072b939366382daf5bf7faa9fbc5892e2483b Mon Sep 17 00:00:00 2001
From: DELTA VOCALOID <105579737+deltaVOCALOID09378@users.noreply.github.com>
Date: Wed, 29 Nov 2023 19:16:55 +0700
Subject: [PATCH 01/12] Create ThaiArpasingBydeltaVOCALOID

develop by deltaVOCALOID
---
 .../Data/ThaiArpasingBydeltaVOCALOID          | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID

diff --git a/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID b/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID
new file mode 100644
index 000000000..46ab5d94d
--- /dev/null
+++ b/OpenUtau.Plugin.Builtin/Data/ThaiArpasingBydeltaVOCALOID
@@ -0,0 +1,77 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using OpenUtau.Api;
+using Serilog;
+
+namespace OpenUtau.Plugin.Builtin {
+    /// <summary>
+    /// The Thai Arpasing Phonemizer.
+    /// <para>
+    /// Arpasing is a system that uses CMUdict as dictionary to convert words to phoneme symbols.
+    /// See http://www.speech.cs.cmu.edu/cgi-bin/cmudict and https://arpasing.neocities.org/en/faq.html.
+    /// </para>
+    /// <para>
+    /// This variant combines, in this order, "Thai arpasing.yaml" from the plugin folder, the
+    /// singer's "arpasing.yaml" (when present) and <see cref="ArpabetG2p"/>.
+    /// </para>
+    /// </summary>
+    // NOTE(review): a class named ArpasingPhonemizer may already exist in this namespace - confirm, and rename this one if so.
+    [Phonemizer("Thai Arpasing Phonemizer", "THAI ARPA")]
+    public class ArpasingPhonemizer : LatinDiphonePhonemizer {
+        /// <summary>
+        /// Runs <c>Initialize()</c>; a failure is logged instead of thrown.
+        /// </summary>
+        public ArpasingPhonemizer() {
+            try {
+                Initialize();
+            } catch (Exception e) {
+                Log.Error(e, "Failed to initialize.");
+            }
+        }
+
+        /// <summary>
+        /// Builds the grapheme-to-phoneme chain from the plugin dictionary, the singer dictionary and ArpabetG2p.
+        /// </summary>
+        /// <returns>A <see cref="G2pFallbacks"/> wrapping the loaded dictionaries in the order they were added.</returns>
+        protected override IG2p LoadG2p() {
+            var g2ps = new List<IG2p>();
+
+            // Load dictionary from plugin folder, creating it from the bundled template when missing.
+            string path = Path.Combine(PluginDir, "Thai arpasing.yaml");
+            if (!File.Exists(path)) {
+                Directory.CreateDirectory(PluginDir);
+                File.WriteAllBytes(path, Data.Resources.arpasing_template);
+            }
+            g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(path)).Build());
+
+            // Load dictionary from singer folder. A file that fails to load is logged and skipped.
+            if (singer != null && singer.Found && singer.Loaded) {
+                string file = Path.Combine(singer.Location, "arpasing.yaml");
+                if (File.Exists(file)) {
+                    try {
+                        g2ps.Add(G2pDictionary.NewBuilder().Load(File.ReadAllText(file)).Build());
+                    } catch (Exception e) {
+                        Log.Error(e, $"Failed to load {file}");
+                    }
+                }
+            }
+
+            // Load base g2p.
+            g2ps.Add(new ArpabetG2p());
+
+            return new G2pFallbacks(g2ps.ToArray());
+        }
+
+        /// <summary>
+        /// Parses the "vowel=fallback,fallback;..." table into a lookup of substitute vowels.
+        /// </summary>
+        /// <returns>Each vowel mapped to the vowels listed after its "=" sign, in the listed order.</returns>
+        protected override Dictionary<string, string[]> LoadVowelFallbacks() {
+            return "aa=ah,ae;ae=ah,aa;ah=aa,ae;ao=ow;ow=ao;eh=ae;ih=iy;iy=ih;uh=uw;uw=uh;aw=ao".Split(';')
+                .Select(entry => entry.Split('='))
+                .ToDictionary(parts => parts[0], parts => parts[1].Split(','));
+        }
+    }
+}

From 5534f44dd628e0c4e2fae764196e028dbe1505a5 Mon Sep 17 00:00:00 2001
From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com>
Date: Fri, 12 Sep 2025 23:39:53 +0700
Subject: [PATCH 02/12] Thai CVVC-VCCV Edit By DELTA SYNTH
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because we found some problem It's about อ set in Thai Language in every using. It's make I feel annoy so much My Teacher have so busy. I make by myselft.
---
 .../Thai_VCCV_2025_Phonemizer.cs              | 259 ++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs

diff --git a/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs
new file mode 100644
index 000000000..853873160
--- /dev/null
+++ b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs
@@ -0,0 +1,259 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using Melanchall.DryWetMidi.Interaction;
+using OpenUtau.Api;
+using OpenUtau.Classic;
+using OpenUtau.Core.Ustx;
+using Serilog;
+
+namespace OpenUtau.Plugin.Builtin {
+    /// <summary>
+    /// Thai CVVC/VCCV phonemizer.
+    /// <para>
+    /// Splits a romanized lyric into leading consonant, cluster consonant, vowel and ending
+    /// consonant, then emits whichever of the resulting aliases exist in the singer's oto.
+    /// </para>
+    /// </summary>
+    [Phonemizer("Thai2 CVVC&VCCV Phonemizer", "TH2 CVVC&VCCV", " Phonemizer by Ferina, PRINTmov and DELTA SYNTH ", language: "TH")]
+    public class Thai2CvvcVccvPhonemizer : Phonemizer {
+        // Vowel symbols. ParseInput keeps the longest one found, so "ia" wins over "i" and "a".
+        static readonly string[] vowels = new string[] {
+            "a", "i", "u", "e", "o", "@", "Q", "3", "6", "1", "ia", "ua", "I", "8"
+        };
+
+        // Second consonant of an initial cluster; the field name is kept as written.
+        static readonly string[] diphthongs = new string[] {
+            "r", "l","w", "y"
+        };
+
+        // Symbols accepted as the leading or the ending consonant of a lyric.
+        static readonly string[] consonants = new string[] {
+            "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y", "-"
+        };
+
+        // Same contents as consonants; not read anywhere in this class.
+        static readonly string[] endingConsonants = new string[] {
+            "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y", "-"
+        };
+
+        private USinger singer;
+
+        /// <summary>Remembers the singer whose oto is searched for aliases.</summary>
+        public override void SetSinger(USinger singer) => this.singer = singer;
+
+        /// <summary>
+        /// Looks up each candidate alias in order and returns the first oto the singer has.
+        /// </summary>
+        /// <param name="input">Candidate aliases, most preferred first.</param>
+        /// <param name="note">Note supplying the tone, tone shift and voice color for the lookup.</param>
+        /// <param name="oto">The first matching oto, or default when nothing matches.</param>
+        /// <returns>True when one of the candidates was found.</returns>
+        private bool checkOtoUntilHit(string[] input, Note note, out UOto oto) {
+            oto = default;
+            if (singer == null) {
+                return false;
+            }
+            var attr = note.phonemeAttributes?.FirstOrDefault(a => a.index == 0) ?? default;
+
+            foreach (string test in input) {
+                if (singer.TryGetMappedOto(test, note.tone + attr.toneShift, attr.voiceColor, out var otoCandidacy)) {
+                    oto = otoCandidacy;
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Builds the phoneme list for one note from its lyric (or phonetic hint) and its neighbours.
+        /// </summary>
+        /// <returns>The aliases to sing with their tick positions; empty when no alias matched.</returns>
+        public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevNeighbour, Note? nextNeighbour, Note[] prevNeighbours) {
+            var note = notes[0];
+            var currentLyric = note.lyric.Normalize();
+            if (!string.IsNullOrEmpty(note.phoneticHint)) {
+                currentLyric = note.phoneticHint.Normalize();
+            }
+
+            var phonemes = new List<Phoneme>();
+            var tests = new List<string>();
+
+            string prevTemp = "";
+            if (prevNeighbour != null) {
+                prevTemp = prevNeighbour.Value.lyric;
+            }
+            var prevTh = ParseInput(prevTemp);
+            var noteTh = ParseInput(currentLyric);
+
+            // Leading part: consonant (+ cluster consonant) + vowel.
+            if (noteTh.Consonant != null && noteTh.Dipthong == null && noteTh.Vowel != null) {
+                if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Vowel }, note, out var tempOto)) {
+                    tests.Add(tempOto.Alias);
+                }
+            } else if (noteTh.Consonant != null && noteTh.Dipthong != null && noteTh.Vowel != null) {
+                // Prefer the alias that also names the ending consonant, then the plain cluster alias.
+                // A null ending consonant concatenates as "", so both candidates are then the same.
+                string cluster = noteTh.Consonant + noteTh.Dipthong + noteTh.Vowel;
+                if (checkOtoUntilHit(new string[] { cluster + noteTh.EndingConsonant, cluster }, note, out var tempOto)) {
+                    tests.Add(tempOto.Alias);
+                } else {
+                    // No combined alias: use the cluster alias and the cluster-consonant + vowel alias.
+                    if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong }, note, out tempOto)) {
+                        tests.Add(tempOto.Alias);
+                    }
+                    if (checkOtoUntilHit(new string[] { noteTh.Dipthong + noteTh.Vowel }, note, out tempOto)) {
+                        tests.Add(tempOto.Alias);
+                    }
+                }
+            }
+
+            // Vowel-initial lyric: connect from the previous note when such an alias exists.
+            if (noteTh.Consonant == null && noteTh.Vowel != null) {
+                if (prevTh.EndingConsonant != null && checkOtoUntilHit(new string[] { prevTh.EndingConsonant + noteTh.Vowel }, note, out var tempOto)) {
+                    tests.Add(tempOto.Alias);
+                } else if (prevTh.Vowel != null && checkOtoUntilHit(new string[] { prevTh.Vowel + noteTh.Vowel }, note, out tempOto)) {
+                    tests.Add(tempOto.Alias);
+                } else if (checkOtoUntilHit(new string[] { noteTh.Vowel }, note, out tempOto)) {
+                    tests.Add(tempOto.Alias);
+                }
+            }
+
+            // Tail: vowel + own ending consonant, otherwise vowel + " " + the next note's consonant.
+            if (noteTh.EndingConsonant != null && noteTh.Vowel != null) {
+                if (checkOtoUntilHit(new string[] { noteTh.Vowel + noteTh.EndingConsonant }, note, out var tempOto)) {
+                    tests.Add(tempOto.Alias);
+                }
+            } else if (nextNeighbour != null && noteTh.Vowel != null) {
+                var nextTh = ParseInput(nextNeighbour.Value.lyric);
+                if (checkOtoUntilHit(new string[] { noteTh.Vowel + " " + nextTh.Consonant }, note, out var tempOto)) {
+                    tests.Add(tempOto.Alias);
+                }
+            }
+
+            // No previous neighbour: switch the first alias to its "-" prefixed form when the singer has it.
+            if (prevNeighbour == null && tests.Count >= 1) {
+                if (checkOtoUntilHit(new string[] { "-" + tests[0] }, note, out var tempOto)) {
+                    tests[0] = tempOto.Alias;
+                }
+            }
+
+            // No next neighbour: add or switch to the "-" suffixed alias when the singer has it.
+            if (nextNeighbour == null && tests.Count >= 1) {
+                if (noteTh.EndingConsonant == null) {
+                    if (checkOtoUntilHit(new string[] { noteTh.Vowel + "-" }, note, out var tempOto)) {
+                        tests.Add(tempOto.Alias);
+                    }
+                } else {
+                    if (checkOtoUntilHit(new string[] { tests[tests.Count - 1] + "-" }, note, out var tempOto)) {
+                        tests[tests.Count - 1] = tempOto.Alias;
+                    }
+                }
+            }
+
+            // Nothing matched: fall back to the lyric itself if the singer has it as an alias.
+            if (tests.Count <= 0) {
+                if (checkOtoUntilHit(new string[] { currentLyric }, note, out _)) {
+                    tests.Add(currentLyric);
+                }
+            }
+
+            if (checkOtoUntilHit(tests.ToArray(), note, out var oto)) {
+                var noteDuration = notes.Sum(n => n.duration);
+
+                for (int i = 0; i < tests.Count; i++) {
+                    int position = 0;
+                    int vcPosition = noteDuration - 50;
+
+                    // An alias containing a space is the transition into the next note: place it
+                    // ahead of the note end by the next oto's overlap plus preutterance.
+                    if (nextNeighbour != null && tests[i].Contains(" ")) {
+                        var nextLyric = nextNeighbour.Value.lyric.Normalize();
+                        if (!string.IsNullOrEmpty(nextNeighbour.Value.phoneticHint)) {
+                            nextLyric = nextNeighbour.Value.phoneticHint.Normalize();
+                        }
+                        var nextTh = ParseInput(nextLyric);
+                        // Null parts concatenate as "", so missing pieces simply drop out.
+                        string nextCheck = nextTh.Consonant + nextTh.Dipthong + nextTh.Vowel;
+                        // Read the attributes stored under index 0, as checkOtoUntilHit does.
+                        var nextAttr = nextNeighbour.Value.phonemeAttributes?.FirstOrDefault(a => a.index == 0) ?? default;
+                        if (singer.TryGetMappedOto(nextCheck, nextNeighbour.Value.tone + nextAttr.toneShift, nextAttr.voiceColor, out var nextOto)) {
+                            if (oto.Overlap > 30) {
+                                vcPosition = noteDuration - MsToTick(nextOto.Overlap) - MsToTick(nextOto.Preutter);
+                            }
+                        }
+                    }
+
+                    if (noteTh.Dipthong == null || tests.Count <= 1) {
+                        if (i == 1) {
+                            position = Math.Max((int)(noteDuration * 0.25), vcPosition);
+                        }
+                    } else {
+                        if (i == 1) {
+                            position = Math.Min((int)(noteDuration * 0.15), 60);
+                        } else if (i == 2) {
+                            position = Math.Max((int)(noteDuration * 0.30), vcPosition);
+                        }
+                    }
+
+                    phonemes.Add(new Phoneme { phoneme = tests[i], position = position });
+                }
+            }
+
+            return new Result {
+                phonemes = phonemes.ToArray()
+            };
+        }
+
+        /// <summary>
+        /// Splits a romanized lyric into its parts; a part that is not found is null.
+        /// </summary>
+        /// <param name="input">Lyric or phonetic hint; null yields four nulls.</param>
+        /// <returns>Leading consonant, cluster consonant, vowel and ending consonant.</returns>
+        (string Consonant, string Dipthong, string Vowel, string EndingConsonant) ParseInput(string input) {
+            string consonant = null;
+            string dipthong = null;
+            string vowel = null;
+            string endingConsonant = null;
+
+            if (input == null) {
+                return (null, null, null, null);
+            }
+
+            // The cluster consonant sits at index 1 after a one-letter consonant ("kr...")
+            // or at index 2 after a two-letter one ("khr...").
+            if (input.Length > 3) {
+                foreach (var dip in diphthongs) {
+                    if (input[1].ToString().Equals(dip) || input[2].ToString().Equals(dip)) {
+                        dipthong = dip;
+                    }
+                }
+            }
+
+            // Keep the longest match so that "kh" wins over "k".
+            foreach (var con in consonants) {
+                if (input.StartsWith(con, StringComparison.Ordinal)) {
+                    if (consonant == null || consonant.Length < con.Length) {
+                        consonant = con;
+                    }
+                }
+                if (input.EndsWith(con, StringComparison.Ordinal)) {
+                    if (endingConsonant == null || endingConsonant.Length < con.Length) {
+                        endingConsonant = con;
+                    }
+                }
+            }
+
+            foreach (var vow in vowels) {
+                if (input.Contains(vow)) {
+                    if (vowel == null || vowel.Length < vow.Length) {
+                        vowel = vow;
+                    }
+                }
+            }
+
+            return (consonant, dipthong, vowel, endingConsonant);
+        }
+    }
+}

From 7b15ffb0148e1b2ae31b54619ab4277b6838700a Mon Sep 17
00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:31:04 +0700 Subject: [PATCH 03/12] Create Red Theme --- OpenUtau/Colors/Red Theme | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 OpenUtau/Colors/Red Theme diff --git a/OpenUtau/Colors/Red Theme b/OpenUtau/Colors/Red Theme new file mode 100644 index 000000000..44985b3f1 --- /dev/null +++ b/OpenUtau/Colors/Red Theme @@ -0,0 +1,48 @@ + + true + + #303030 + #505050 + #707070 + #404040 + + #E0E0E0 + #FCFCFC + #FFFFFF + #A0A0A0 + + #707070 + #B0B0B0 + + #4EA6EA + + #90CAF9 + + #1E88E5 + + #808080 + #A0A0A0 + #4EA6EA + #FF679D + #E62E6E + + #707070 + #D0D0D0 + #D0D0D0 + #404040 + + + #CC2A63 + #FF347C + #FFFFFF + + #CCA5B0 + #FFCEDC + #FF347C + + Transparent + Transparent + #FFFFFF + From 274e922839c9a94d4839f7b8ebf3e7ff31ae2432 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:32:32 +0700 Subject: [PATCH 04/12] Rename Red Theme to Red Theme.axml --- OpenUtau/Colors/{Red Theme => Red Theme.axml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename OpenUtau/Colors/{Red Theme => Red Theme.axml} (100%) diff --git a/OpenUtau/Colors/Red Theme b/OpenUtau/Colors/Red Theme.axml similarity index 100% rename from OpenUtau/Colors/Red Theme rename to OpenUtau/Colors/Red Theme.axml From df579bfb5101b0e3587c9247518424c3d3141301 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Sun, 28 Sep 2025 13:26:06 +0700 Subject: [PATCH 05/12] Update crowdin.yml Signed-off-by: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> --- .github/workflows/crowdin.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/crowdin.yml b/.github/workflows/crowdin.yml index 64d80fa56..d6b78e3f7 100644 --- a/.github/workflows/crowdin.yml +++ 
b/.github/workflows/crowdin.yml @@ -1,4 +1,4 @@ -name: Crowdin Action +name: THAI CVVC-VCCV on: workflow_dispatch: {} From b37f18ee7147fb8965a6411610f5a123b31f5ab1 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Mon, 27 Oct 2025 01:38:09 +0700 Subject: [PATCH 06/12] =?UTF-8?q?=E0=B8=AA=E0=B8=A3=E0=B9=89=E0=B8=B2?= =?UTF-8?q?=E0=B8=87=E0=B9=82=E0=B8=94=E0=B8=A2=E0=B9=83=E0=B8=8A=E0=B9=89?= =?UTF-8?q?=20Colab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSinger_colab_notebook.ipynb | 1741 +++++++++++++++++++++++++++++++ 1 file changed, 1741 insertions(+) create mode 100644 DiffSinger_colab_notebook.ipynb diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb new file mode 100644 index 000000000..e1a4deaba --- /dev/null +++ b/DiffSinger_colab_notebook.ipynb @@ -0,0 +1,1741 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "MP5rRkbTpnG8", + "Wv0gfI5feBSc", + "eexZl_OCDmQ3", + "0J3b18EKdzMC", + "FY40fGHEg9_i", + "4sbU1aH5kGFE" + ], + "gpuType": "T4", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MP5rRkbTpnG8" + }, + "source": [ + "# _**[DiffSinger](https://github.com/openvpi/DiffSinger)**_\n", + "_Singing Voice Synthesis via Shallow Diffusion Mechanism (SVS & TTS)_\n", + "\n", + "\\\n", + "____\n", + "\n", + "Note:\n", + "- This notebook will get update semi-frequently based from the feedback or response from users\n", + "- Make sure to compare your file structure to the [data 
example](https://github.com/usamireko/DiffSinger_colab_notebook_MLo7/blob/main/data_example.md)\n", + "\n", + "```We refer \"variance\" as \"parameters\" to avoid the confusion```\n", + "\n", + "```Use export_mode if only wanting to export your ONNX files and nothing more```\n", + "\n", + "\\\n", + "____\n", + "\\\n", + "#### **This notebook is an edited copy of Kei's Diffsinger [colab notebook](https://colab.research.google.com/drive/1kUg9dz8PPH92NfnLZwgq0_9B9an39t1J?usp=sharing)**\n", + "####**This notebook is maintained by MLo7**\n", + "\n", + "___" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Setup**" + ], + "metadata": { + "id": "Wv0gfI5feBSc" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pK8aicf8A2sj", + "cellView": "form", + "collapsed": true + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output, Audio, display, HTML\n", + "import os\n", + "from google.colab import drive\n", + "\n", + "def setup_onnx_export():\n", + " print(\"ONNX Export Mode Enabled, Installing required components\")\n", + " !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger\n", + " !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh\n", + " !chmod +x /content/mini.sh\n", + " !bash /content/mini.sh -b -f -p /usr/local\n", + " !conda install -q -y jupyter\n", + " !conda install -q -y google-colab -c conda-forge\n", + " !python -m ipykernel install --name \"py310\" --user\n", + " print(\"installing dependencies for ONNX conversion\")\n", + " !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null\n", + " print(\"Installation complete, time to export those ONNX!\")\n", + "\n", + "def setup_standard():\n", + " if not os.path.exists(\"/content/pretrain_models\"):\n", + " os.makedirs(\"/content/pretrain_models\")\n", + "\n", + " !wget 
https://github.com/MLo7Ghinsan/DiffSinger_colab_notebook_MLo7/releases/download/OU_files/jpn_dict.txt -O /content/jpn_dict.txt\n", + " !rm -rf /content/sample_data\n", + " !apt-get install aria2\n", + " clear_output()\n", + " !git clone https://github.com/UtaUtaUtau/nnsvs-db-converter /content/nnsvs-db-converter\n", + " !git clone https://github.com/openvpi/DiffSinger.git /content/DiffSinger\n", + " !git clone https://github.com/openvpi/MakeDiffSinger /content/MakeDiffSinger\n", + " !git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans\n", + " !git clone https://github.com/openvpi/SOME /content/SOME\n", + " clear_output()\n", + " !pip install torch torchvision torchaudio\n", + " clear_output()\n", + " !pip install -r /content/DiffSinger/requirements.txt\n", + " !pip install -r /content/SOME/requirements.txt\n", + " !pip install mido einops\n", + " clear_output()\n", + " !wget https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-44.1k-hop512-128bin-2024.02/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -O /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip\n", + " !wget https://github.com/openvpi/vocoders/releases/download/pc-nsf-hifigan-44.1k-hop512-128bin-2025.02/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -O /content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip\n", + " !wget https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip -O /content/rmvpe.zip\n", + " !wget https://github.com/openvpi/SOME/releases/download/v1.0.0-baseline/0119_continuous128_5spk.zip -O /content/0119_continuous128_5spk.zip\n", + " !wget https://github.com/yxlllc/vocal-remover/releases/download/hnsep_240512/hnsep_240512.zip -O /content/DiffSinger/checkpoints/hnsep_240512.zip\n", + " !unzip -q /content/DiffSinger/checkpoints/hnsep_240512.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/0119_continuous128_5spk.zip -d /content/DiffSinger/checkpoints/SOME\n", + " !unzip -q 
/content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/rmvpe.zip -d /content/DiffSinger/checkpoints\n", + " !unzip -q /content/rmvpe.zip -d /content/MakeDiffSinger/variance-temp-solution/assets\n", + " !rm /content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip\n", + " !rm /content/rmvpe.zip\n", + " !rm /content/0119_continuous128_5spk.zip\n", + " !aria2c -d /content/pretrain_models -o acoustic_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/acoustic/model_ckpt_steps_49000.ckpt\n", + " !aria2c -d /content/pretrain_models -o variance_pretrain.ckpt https://github.com/haru0l/diffsinger_models/releases/download/variance/model_ckpt_steps_51000.ckpt\n", + " clear_output()\n", + " !pip install --upgrade tensorboard\n", + " clear_output()\n", + " !pip install protobuf\n", + " clear_output()\n", + " !pip install onnxruntime\n", + " clear_output()\n", + " !pip install pydub\n", + " clear_output()\n", + "\n", + "#@title # Mount Google Drive and Setup\n", + "export_mode = False # @param {\"type\":\"boolean\"}\n", + "drive.mount(\"/content/drive\")\n", + "\n", + "if export_mode:\n", + " setup_onnx_export()\n", + "else:\n", + " setup_standard()\n", + "\n", + "clear_output()\n", + "print(\"setup complete!\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "!git clone https://github.com/MLo7Ghinsan/ghin_shenanigans /content/ghin_shenanigans 2>/dev/null\n", + "chika_dance = ''\n", + "display(HTML(chika_dance))\n", + "with open(\"/content/ghin_shenanigans/audio/setup_complete.wav\", \"rb\") as f:\n", + " setup_complete_sound = f.read()\n", + "Audio(data=setup_complete_sound, autoplay=True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Preprocess data for training**" + ], + "metadata": { + "id": "eexZl_OCDmQ3" + } + }, + { + "cell_type": "code", + "source": [ 
+ "#@title #Extract Data\n", + "#@markdown ___\n", + "%cd /content\n", + "#@markdown this cell will create a folder name [raw_data] in the root folder of colab (/content) and extract your data into it\n", + "\n", + "data_type = \"lab + wav (NNSVS format)\" # @param [\"lab + wav (NNSVS format)\", \"csv + wav (DiffSinger format)\", \"ds (DiffSinger format)\"]\n", + "\n", + "#@markdown The path to your data zip file\n", + "\n", + "data_zip_path = \"\" #@param {type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown nnsvs-db-converter settings (lab + wav ONLY)\n", + "\n", + "#@markdown _These values can exceed the amount that's in your data to maximize the segment length or to keep the data as is_\n", + "\n", + "#@markdown This option is necessary for variance's pitch training\n", + "estimate_midi_option = \"False\" # @param [\"False\", \"True | parselmouth\", \"True | harvest\", \"True | SOME\"]\n", + "if estimate_midi_option == \"True | parselmouth\":\n", + " estimate_midi = True\n", + " midi_pitch_ext = \"parselmouth\"\n", + "elif estimate_midi_option == \"True | harvest\":\n", + " estimate_midi = True\n", + " midi_pitch_ext = \"harvest\"\n", + "else:\n", + " estimate_midi = False\n", + " midi_pitch_ext = None\n", + "#@markdown Determine how long it will segment your data to based on silence phoneme placement (seconds)\n", + "segment_length = 15 #@param {type:\"slider\", min:5, max:35, step:1}\n", + "\n", + "#@markdown Determine how many silence phoneme is allowed in the middle of each segment\n", + "max_silence_phoneme_amount = 2 #@param {type:\"slider\", min:0, max:50, step:1}\n", + "\n", + "# leaving -S at 60 so max silence can be 60 seconds that exceeds the segment legnth cap idk why///\n", + "# making the segment length cap at 35 secs because any longer than that would make training goes really slow\n", + "\n", + "# my ass dont remember why i made two... 
i think one is unnecessary extra but mehhh\n", + "all_shits = \"/content/raw_data\"\n", + "all_shits_not_wav_n_lab = \"/content/raw_data/diffsinger_db\"\n", + "\n", + "import os\n", + "import csv\n", + "import json\n", + "import shutil\n", + "from pydub import AudioSegment\n", + "import yaml\n", + "\n", + "if os.path.exists(\"/content/raw_data\"):\n", + " shutil.rmtree(\"/content/raw_data\")\n", + "\n", + "if not os.path.exists(all_shits_not_wav_n_lab):\n", + " os.makedirs(all_shits_not_wav_n_lab)\n", + "\n", + "# using 'if not' bc i edited the wrong section which im also too lazy to fix it <3\n", + "if not data_type == \"lab + wav (NNSVS format)\":\n", + " #changed to 7zip to support more compression types\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + " for root, dirs, files in os.walk(all_shits):\n", + " for filename in files:\n", + " if filename.endswith(\".lab\"):\n", + " file_path = os.path.join(root, filename)\n", + " with open(file_path, \"r\") as file:\n", + " file_data = file.read()\n", + " file_data = file_data.replace(\"SP\", \"pau\")\n", + " file_data = file_data.replace(\"br\", \"AP\")\n", + " with open(file_path, \"w\") as file:\n", + " file.write(file_data)\n", + "\n", + "else:\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + "\n", + "\n", + "# for funny auto dict generator lmao\n", + "out = \"/content/DiffSinger/dictionaries\"\n", + "dictionary_files = []\n", + "dictionary_conf_lines = []\n", + "\n", + "def is_excluded(phoneme):\n", + " return phoneme in [\"pau\", \"AP\", \"SP\", \"sil\"]\n", + "\n", + "lang_config_path = all_shits_not_wav_n_lab +\"/lang_config.yaml\"\n", + "\n", + "if not os.path.exists(lang_config_path):\n", + " extra_phonemes = []\n", + " merged_phoneme_groups = []\n", + " all_phonemes = set()\n", + "\n", + " for root, dirs, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " fpath = os.path.join(root, file)\n", + " # honestly if people still have whatever/phoneme 
in their single dict, they shouldnt be doing single dict in the first place\n", + " if file.endswith(\".lab\"):\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " parts = line.strip().split()\n", + " if len(parts) < 3:\n", + " continue\n", + " phoneme = parts[2]\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " elif file.endswith(\".csv\"):\n", + " with open(fpath, \"r\", newline=\"\") as csv_file:\n", + " csv_reader = csv.DictReader(csv_file)\n", + " for row in csv_reader:\n", + " if \"ph_seq\" in row:\n", + " for phoneme in row[\"ph_seq\"].strip().split():\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " elif file.endswith(\".ds\"):\n", + " with open(fpath, \"r\") as json_file:\n", + " data = json.load(json_file)\n", + " for entry in data:\n", + " if \"ph_seq\" in entry:\n", + " for phoneme in entry[\"ph_seq\"].strip().split():\n", + " if \"/\" in phoneme:\n", + " _, phoneme = phoneme.split(\"/\", 1)\n", + " if not is_excluded(phoneme):\n", + " all_phonemes.add(phoneme)\n", + "\n", + " os.makedirs(out, exist_ok=True)\n", + " custom_dict_path = os.path.join(out, \"dictionary-custom.txt\")\n", + " dictionary_files.append(custom_dict_path)\n", + " dictionary_conf_lines.append(f\"custom: '{custom_dict_path}'\")\n", + " with open(custom_dict_path, \"w\", encoding=\"utf-8\") as out_file:\n", + " for phoneme in sorted(all_phonemes):\n", + " out_file.write(f\"{phoneme}\\t{phoneme}\\n\")\n", + " lang_dict = None\n", + "\n", + "else:\n", + " with open(lang_config_path, \"r\") as yaml_file:\n", + " lang_config = yaml.safe_load(yaml_file)\n", + "\n", + " languages = lang_config.get(\"languages\", [])\n", + " extra_phonemes = lang_config.get(\"extra_phonemes\", [])\n", + " merged_phoneme_groups = 
lang_config.get(\"merged_phoneme_groups\", [])\n", + "\n", + " lang_dict = {lang: set() for lang in languages}\n", + "\n", + " for folder in os.listdir(all_shits_not_wav_n_lab):\n", + " if \".\" in folder:\n", + " _, lang_code = folder.rsplit(\".\", 1)\n", + " if lang_code not in languages:\n", + " continue\n", + "\n", + " phoneme_folder_path = os.path.join(all_shits_not_wav_n_lab, folder)\n", + "\n", + " for root, dirs, files in os.walk(phoneme_folder_path):\n", + " for file in files:\n", + " fpath = os.path.join(root, file)\n", + "\n", + " if data_type == \"lab + wav (NNSVS format)\":\n", + " if file.endswith(\".lab\"):\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " line = line.strip()\n", + " if not line:\n", + " continue\n", + " parts = line.split()\n", + " if len(parts) < 3:\n", + " continue\n", + " phoneme = parts[2]\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " elif data_type == \"csv + wav (DiffSinger format)\":\n", + " if file.endswith(\".csv\"):\n", + " with open(fpath, \"r\", newline=\"\") as csv_file:\n", + " csv_reader = csv.DictReader(csv_file)\n", + " for row in csv_reader:\n", + " if \"ph_seq\" in row:\n", + " ph_seq = row[\"ph_seq\"].strip()\n", + " for phoneme in ph_seq.split():\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " else:\n", + " if file.endswith(\".ds\"):\n", + " with open(fpath, \"r\") as json_file:\n", + " data = json.load(json_file)\n", + " for entry in 
data:\n", + " if \"ph_seq\" in entry:\n", + " ph_seq = entry[\"ph_seq\"].strip()\n", + " for phoneme in ph_seq.split():\n", + " if \"/\" in phoneme:\n", + " lang_hint, actual_phoneme = phoneme.split(\"/\", 1)\n", + " if lang_hint in languages and not is_excluded(actual_phoneme):\n", + " lang_dict[lang_hint].add(actual_phoneme)\n", + " continue\n", + " if not is_excluded(phoneme):\n", + " lang_dict[lang_code].add(phoneme)\n", + "\n", + " for lang, ph_set in lang_dict.items():\n", + " output_path = os.path.join(out, f\"dictionary-{lang}.txt\")\n", + " dictionary_files.append(output_path)\n", + " dictionary_conf_lines.append(f\"{lang}: '{output_path}'\")\n", + " with open(output_path, \"w\", encoding=\"utf-8\") as out_file:\n", + " for phoneme in sorted(ph_set):\n", + " out_file.write(f\"{phoneme}\\t{phoneme}\\n\")\n", + "\n", + "# used this for check runs\n", + "#for dicks in dictionary_files:\n", + "# print(dicks)\n", + "\n", + "# for vowels and consonants.txt.... well adding luquid type for uta's script\n", + "dict_path = out\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\"}\n", + "liquid_types = {\"y\", \"w\", \"l\", \"r\"} # r for english labels, it should be fine with jp too\n", + "vowel_data = []\n", + "consonant_data = []\n", + "liquid_data = []\n", + "\n", + "for dict_path in dictionary_files:\n", + " with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " if phoneme[0] in vowel_types:\n", + " vowel_data.append(phoneme)\n", + " elif phoneme[0] in liquid_types:\n", + " liquid_data.append(phoneme)\n", + " else:\n", + " consonant_data.append(phoneme)\n", + "\n", + "vowel_data.sort()\n", + "liquid_data.sort()\n", + "consonant_data.sort()\n", + "directory = os.path.dirname(dict_path)\n", + "\n", + "# make txt for language json file\n", + "vowel_txt_path = os.path.join(directory, \"vowels.txt\")\n", + "with open(vowel_txt_path, \"w\") as f:\n", + " f.write(\" 
\".join(vowel_data))\n", + "liquid_txt_path = os.path.join(directory, \"liquids.txt\")\n", + "with open(liquid_txt_path, \"w\") as f:\n", + " f.write(\" \".join(liquid_data))\n", + "consonant_txt_path = os.path.join(directory, \"consonants.txt\")\n", + "with open(consonant_txt_path, \"w\") as f:\n", + " f.write(\" \".join(consonant_data))\n", + "\n", + "\n", + "# here's a funny json append\n", + "with open(vowel_txt_path, \"r\") as f:\n", + " vowel_data = f.read().split()\n", + "with open(liquid_txt_path, \"r\") as f:\n", + " liquid_data = f.read().split()\n", + "with open(consonant_txt_path, \"r\") as f:\n", + " consonant_data = f.read().split()\n", + "liquid_list = {liquid: True for liquid in liquid_data} #temp fix, might need more research about the push in timing'''\n", + "phones4json = {\"vowels\": vowel_data, \"liquids\": liquid_list}\n", + "with open(\"/content/nnsvs-db-converter/lang.sample.json\", \"w\") as rawr:\n", + " json.dump(phones4json, rawr, indent=4)\n", + "\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " db_converter_script = \"/content/nnsvs-db-converter/db_converter.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " if estimate_midi:\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -m -c -L \"/content/nnsvs-db-converter/lang.sample.json\" {raw_folder_path}\n", + " else:\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -l {segment_length} -L \"/content/nnsvs-db-converter/lang.sample.json\" {raw_folder_path}\n", + " !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab\n", + " !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null\n", + " !rm -rf {raw_folder_path}/diffsinger_db\n", + " if estimate_midi_option == \"True | SOME\":\n", + " !python /content/SOME/batch_infer.py --model 
\"/content/DiffSinger/checkpoints/SOME/0119_continuous256_5spk/model_ckpt_steps_100000_simplified.ckpt\" --dataset {raw_folder_path} --overwrite\n", + "\n", + "elif data_type == \"ds (DiffSinger format)\":\n", + " ds_segment_script = \"/content/ghin_shenanigans/scripts/ds_segmentor.py\"\n", + " ds2csv_script = \"/content/MakeDiffSinger/variance-temp-solution/convert_ds.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " ds_exp_path = os.path.join(raw_folder_path, \"ds\")\n", + " csv_exp_path = os.path.join(raw_folder_path, \"transcriptions.csv\")\n", + " !python {ds_segment_script} {raw_folder_path} --export_path {ds_exp_path}\n", + " !rm -rf {raw_folder_path}/*.ds #clean it cus why not\n", + " !python {ds2csv_script} ds2csv {ds_exp_path} {csv_exp_path}\n", + "else:\n", + " pass\n", + "\n", + "# make it replace the first SP to AP cus it seems like people always forgot about it\n", + "for root, _, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " if file.endswith(\".csv\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\", newline=\"\") as input_file:\n", + " csv_reader = csv.reader(input_file)\n", + " data = [row for row in csv_reader]\n", + " header = data[0]\n", + " if \"ph_seq\" in header:\n", + " ph_seq_index = header.index(\"ph_seq\")\n", + " if len(data) > 1 and len(data[1]) > ph_seq_index:\n", + " data[1][ph_seq_index] = data[1][ph_seq_index].replace(\"SP\", \"AP\", 1)\n", + " with open(file_path, \"w\", newline=\"\") as output_file:\n", + " csv_writer = csv.writer(output_file)\n", + " csv_writer.writerows(data)\n", + "\n", + "print(\"extraction complete!\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"I'm also nice enough to convert your data and also write your dictionaries lmao. 
You are welcome :)\")" + ], + "metadata": { + "cellView": "form", + "id": "JsP1TGg2F1g3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title #Edit Config\n", + "#@markdown ___\n", + "\n", + "import re\n", + "import os\n", + "import yaml\n", + "import random #for the random test files lmaoz\n", + "\n", + "%cd /content\n", + "clear_output()\n", + "#@markdown The model type user is training\n", + "model_type = \"acoustic\" # @param [\"acoustic\", \"variance\"]\n", + "config_cap = model_type.upper()\n", + "diffusion_type = \"reflow\" # @param [\"ddpm\", \"reflow\"]\n", + "diff_accelerator = \"unipc\" # @param [\"ddim\", \"pndm\", \"dpm-solver\", \"unipc\"]\n", + "loss_type = \"l2\" # @param [\"l1\", \"l2\"]\n", + "\n", + "spk_name = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]\n", + "# i used spk_name for something else cus i forgor now imma just copy and paste it\n", + "spk_names = [folder_name for folder_name in os.listdir(all_shits_not_wav_n_lab) if os.path.isdir(os.path.join(all_shits_not_wav_n_lab, folder_name))]\n", + "num_spk = len(spk_name)\n", + "num_lang = len(dictionary_files)\n", + "raw_dir = []\n", + "datasets = []\n", + "for folder_name in spk_name:\n", + " folder_path = os.path.join(all_shits_not_wav_n_lab, folder_name)\n", + " raw_dir.append(folder_path)\n", + "folder_to_id = {folder_name: i for i, folder_name in enumerate(spk_name)}\n", + "\n", + "if num_spk == 1:\n", + " singer_type = \"SINGLE-SPEAKER\"\n", + " use_spk_id = False\n", + "\n", + " for spk_id, (folder_path, speaker_name) in enumerate(zip(raw_dir, spk_name)):\n", + " if data_type == \"ds (DiffSinger format)\":\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path) if f.endswith(\".ds\")]\n", + " else:\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path + \"/wavs\") if f.endswith(\".wav\")]\n", + " folder_id = 
folder_to_id.get(speaker_name, -1)\n", + " prefixed_audio_files = [f\"{audio_file}\" for audio_file in audio_files]\n", + "\n", + " random_ass_test_files = prefixed_audio_files[:3]\n", + "\n", + " speaker_name, lang_id = os.path.splitext(speaker_name) #tfw i forgot this last time\n", + "\n", + " datasets.append({\n", + " \"raw_data_dir\": folder_path,\n", + " \"speaker\": speaker_name,\n", + " \"spk_id\": 0,\n", + " \"language\": \"custom\",\n", + " \"test_prefixes\": random_ass_test_files\n", + " })\n", + "else:\n", + " singer_type = \"MULTI-SPEAKER\"\n", + " use_spk_id = True\n", + "\n", + " for spk_id, (folder_path, speaker_name) in enumerate(zip(raw_dir, spk_name)):\n", + " if data_type == \"ds (DiffSinger format)\":\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path) if f.endswith(\".ds\")]\n", + " else:\n", + " audio_files = [f[:-4] for f in os.listdir(folder_path + \"/wavs\") if f.endswith(\".wav\")]\n", + " folder_id = folder_to_id.get(speaker_name, -1)\n", + " prefixed_audio_files = [f\"{audio_file}\" for audio_file in audio_files]\n", + "\n", + " random_ass_test_files = prefixed_audio_files[:3]\n", + "\n", + " speaker_name, lang_id = os.path.splitext(speaker_name) #tfw i forgot this last time\n", + "\n", + " datasets.append({\n", + " \"raw_data_dir\": folder_path,\n", + " \"speaker\": speaker_name,\n", + " \"spk_id\": spk_id,\n", + " \"language\": lang_id.lstrip(\".\") or \"custom\",\n", + " \"test_prefixes\": random_ass_test_files\n", + " })\n", + "\n", + "dictionaries = {}\n", + "for line in dictionary_conf_lines:\n", + " key, value = line.split(\": \", 1)\n", + " dictionaries[key] = value.strip(\"'\")\n", + "\n", + "#@markdown Shallow Diffusion training\n", + "use_shallow_diffusion = \"true | gt_val\" # @param [\"false\", \"true | aux_val\", \"true | gt_val\"]\n", + "if use_shallow_diffusion == \"false\":\n", + " shallow = False\n", + " gt_shallow = False\n", + "elif use_shallow_diffusion == \"true | aux_val\":\n", + " shallow = True\n", + " 
gt_shallow = False\n", + "else:\n", + " shallow = True\n", + " gt_shallow = True\n", + "\n", + "#@markdown Half precision, or mixed precision can result in improved performance, achieving speedups on training (from [doc](https://lightning.ai/docs/pytorch/stable/common/trainer.html#precision))\n", + "# the reason why i dont add 64 is because colab is already dreadfully slow at 32 so yes im leaving it out\n", + "precision = \"16-mixed\" # @param [\"32-true\", \"bf16-mixed\", \"16-mixed\", \"bf16\", \"16\"]\n", + "\n", + "#@markdown User model save path\n", + "save_dir = \"\" #@param {type:\"string\"}\n", + "\n", + "binary_save_dir = save_dir + \"/binary\"\n", + "\n", + "conf_dir = save_dir\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Option to use base model for finetuning\n", + "\n", + "enable_finetuning = False # @param {type:\"boolean\"}\n", + "\n", + "\n", + "#@markdown Path to custom base model, leave blank to use [default](https://github.com/haru0l/diffsinger_models) models\n", + "#wtf haru i just looked at your readme\"\"\"\"\"\n", + "\n", + "base_model_path = \"\" # @param {type:\"string\"}\n", + "\n", + "if enable_finetuning:\n", + " pretrain = True\n", + " if base_model_path:\n", + " pretrain_ckpt = base_model_path\n", + " else:\n", + " pretrain_ckpt = f\"/content/pretrain_models/{model_type}_pretrain.ckpt\"\n", + " finetune_strict_shapes = False\n", + " finetune_ckpt_path = pretrain_ckpt\n", + "else:\n", + " pretrain = False\n", + " finetune_strict_shapes = True #default value\n", + " finetune_ckpt_path = None #default value\n", + "\n", 
+ "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Model embeds check; Tension, Energy, Breathiness, Voicing | for both acoustic and variance\n", + "\n", + "#@markdown we limited the pair up choice to prevent the quality and usage issue, if user wish to enable option(s) outside of these choices then please keep in mind that most of these embeds do not work well together except for [energy + breathiness]\n", + "\n", + "selected_param = \"tension + voicing\" # @param [\"energy\", \"breathiness\", \"energy + breathiness\", \"tension\", \"voicing\", \"tension + voicing\", \"none\"]\n", + "param_flags = {\n", + " \"energy\": {\"tension\": False, \"energy\": True, \"breathiness\": False, \"voicing\": False},\n", + " \"breathiness\": {\"tension\": False, \"energy\": False, \"breathiness\": True, \"voicing\": False},\n", + " \"energy + breathiness\": {\"tension\": False, \"energy\": True, \"breathiness\": True, \"voicing\": False},\n", + " \"tension\": {\"tension\": True, \"energy\": False, \"breathiness\": False, \"voicing\": False},\n", + " \"voicing\": {\"tension\": False, \"energy\": False, \"breathiness\": False, \"voicing\": True},\n", + " \"tension + voicing\": {\"tension\": True, \"energy\": False, \"breathiness\": False, \"voicing\": True},\n", + " \"none\": {\"tension\": False, \"energy\": False, \"breathiness\": False, \"voicing\": False},\n", + "}\n", + "\n", + "flags = param_flags.get(selected_param, param_flags[\"none\"])\n", + "\n", + "tension_training = flags[\"tension\"]\n", + "energy_training = flags[\"energy\"]\n", + "breathiness_training = 
flags[\"breathiness\"]\n", + "voicing_training = flags[\"voicing\"]\n", + "\n", + "parameter_extraction_method = \"vr\" # @param [\"vr\", \"world\"]\n", + "\n", + "### forcing data aug to be true by default cus i dont think anyone would disable it and its good to be on by default\n", + "data_aug = True #param {type:\"boolean\"}\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Model training check | for variance only\n", + "\n", + "\n", + "\n", + "#@markdown due to skill issues, if user wish to train with glide embed, please enable it manually in the config\n", + "pitch_training = \"False\" # @param [\"False\", \"True | Standard\", \"True | MelodyEncoder\"]\n", + "if pitch_training == \"False\":\n", + " pitch_training = False\n", + " use_melody_encoder = False\n", + " use_glide_embed = False\n", + "elif pitch_training == \"True | Standard\":\n", + " pitch_training = True\n", + " use_melody_encoder = False\n", + " use_glide_embed = False\n", + "else:\n", + " pitch_training = True\n", + " use_melody_encoder = True\n", + " use_glide_embed = False\n", + "\n", + "duration_training = True #@param {type: \"boolean\"}\n", + "\n", + "#@markdown 
....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Pitch extractor algorithm\n", + "\n", + "f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\", \"harvest\"]\n", + "if f0_ext == \"rmvpe\":\n", + " pe_ckpt_pth = \"checkpoints/rmvpe/model.pt\"\n", + "else:\n", + " pe_ckpt_pth = None\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "\n", + "#@markdown Proceeding sections are the parameters that will greatly affect the model's final quality and size. 
Read about them [here](https://github.com/openvpi/DiffSinger/blob/main/docs/ConfigurationSchemas.md)\n", + "\n", + "#@markdown So if you don't know what they do then please leave these options at default , otherwise it could affect your model badly\n", + "\n", + "#@markdown anyone is welcome to experiment though\n", + "\n", + "#@markdown model_hidden_size: hidden layers for FS2 and token param embeds\n", + "\n", + "#@markdown model_residual_layers | model_residual_channels: the model's main layers and channels\n", + "\n", + "#@markdown ....................................................................................................................................................................................................................................................................................................................................................................................................................................\n", + "#@markdown Model's network/layer size for acoustic\n", + "\n", + "#@markdown The quality of samplig_algorithm is in order, range from euler being the LEAST accurate to rk5 being the MOST accurate.... 
Though euler works fine on most cases\n", + "sampling_algorithm = \"euler\" # @param [\"euler\", \"rk2\", \"rk4\", \"rk5\"]\n", + "\n", + "acoustic_hidden_size = 256 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "acoustic_num_layers = 6 # @param {type:\"slider\", min:2, max:42, step:2}\n", + "acoustic_num_channels = 1024 # @param {type:\"slider\", min:2, max:2048, step:2}\n", + "\n", + "#@markdown Model's network/layer size for variance\n", + "variance_hidden_size = 256 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "duration_hidden_size = 512 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "melody_encoder_hidden_size = 128 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "pitch_num_layers = 6 # @param {type:\"slider\", min:2, max:100, step:2}\n", + "pitch_num_channels = 512 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "variance_num_layers = 6 # @param {type:\"slider\", min:2, max:100, step:2}\n", + "variance_num_channels = 384 # @param {type:\"slider\", min:2, max:1024, step:2}\n", + "\n", + "\n", + "\n", + "with open(\"/content/DiffSinger/configs/base.yaml\", \"r\") as config:\n", + " mother = yaml.safe_load(config)\n", + "mother[\"pl_trainer_precision\"] = precision\n", + "with open(\"/content/DiffSinger/configs/base.yaml\", \"w\") as config:\n", + " yaml.dump(mother, config)\n", + "\n", + "if data_type == \"ds (DiffSinger format)\":\n", + " prefer_ds = True\n", + "else:\n", + " prefer_ds = False\n", + "\n", + "if model_type == \"acoustic\":\n", + " with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"r\") as config:\n", + " bitch_ass_config = yaml.safe_load(config)\n", + " bitch_ass_config[\"datasets\"] = datasets\n", + " bitch_ass_config[\"num_spk\"] = num_spk\n", + " bitch_ass_config[\"use_spk_id\"] = use_spk_id\n", + " bitch_ass_config[\"extra_phonemes\"] = extra_phonemes\n", + " bitch_ass_config[\"merged_phoneme_groups\"] = merged_phoneme_groups\n", + " 
bitch_ass_config[\"use_lang_id\"] = bool(merged_phoneme_groups)\n", + " bitch_ass_config[\"num_lang\"] = num_lang\n", + " bitch_ass_config[\"pretrain\"] = pretrain\n", + " bitch_ass_config[\"diffusion_type\"] = diffusion_type\n", + " bitch_ass_config[\"diff_accelerator\"] = diff_accelerator\n", + " bitch_ass_config[\"main_loss_type\"] = loss_type\n", + " bitch_ass_config[\"binary_data_dir\"] = binary_save_dir\n", + " bitch_ass_config[\"dictionaries\"] = dictionaries\n", + " bitch_ass_config[\"augmentation_args\"][\"random_pitch_shifting\"][\"enabled\"] = data_aug\n", + " bitch_ass_config[\"augmentation_args\"][\"random_time_stretching\"][\"enabled\"] = data_aug\n", + " bitch_ass_config[\"use_key_shift_embed\"] = data_aug\n", + " bitch_ass_config[\"use_speed_embed\"] = data_aug\n", + " bitch_ass_config[\"pe\"] = f0_ext\n", + " bitch_ass_config[\"use_energy_embed\"] = energy_training\n", + " bitch_ass_config[\"use_breathiness_embed\"] = breathiness_training\n", + " bitch_ass_config[\"use_tension_embed\"] = tension_training\n", + " bitch_ass_config[\"use_voicing_embed\"] = voicing_training\n", + "\n", + " bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth\n", + " bitch_ass_config[\"tension_smooth_width\"] = 0.06 #0.12\n", + " #shallow diff stuff\n", + " bitch_ass_config[\"use_shallow_diffusion\"] = shallow\n", + " bitch_ass_config[\"shallow_diffusion_args\"][\"val_gt_start\"] = gt_shallow\n", + " #finetue stuff\n", + " bitch_ass_config[\"finetune_enabled\"] = enable_finetuning\n", + " bitch_ass_config[\"finetune_ckpt_path\"] = finetune_ckpt_path\n", + " bitch_ass_config[\"finetune_strict_shapes\"] = finetune_strict_shapes\n", + " #vr\n", + " bitch_ass_config[\"hnsep\"] = parameter_extraction_method\n", + " #layers\n", + " bitch_ass_config[\"sampling_algorithm\"] = sampling_algorithm\n", + " bitch_ass_config[\"hidden_size\"] = acoustic_hidden_size\n", + " bitch_ass_config[\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"backbone_args\"][\"num_layers\"] = 
acoustic_num_layers\n", + " bitch_ass_config[\"backbone_args\"][\"num_channels\"] = acoustic_num_channels\n", + "\n", + " with open(\"/content/DiffSinger/configs/acoustic.yaml\", \"w\") as config:\n", + " yaml.dump(bitch_ass_config, config)\n", + "else:\n", + " with open(\"/content/DiffSinger/configs/variance.yaml\", \"r\") as config:\n", + " bitch_ass_config = yaml.safe_load(config)\n", + " bitch_ass_config[\"datasets\"] = datasets\n", + " bitch_ass_config[\"num_spk\"] = num_spk\n", + " bitch_ass_config[\"use_spk_id\"] = use_spk_id\n", + " bitch_ass_config[\"extra_phonemes\"] = extra_phonemes\n", + " bitch_ass_config[\"merged_phoneme_groups\"] = merged_phoneme_groups\n", + " bitch_ass_config[\"use_lang_id\"] = bool(merged_phoneme_groups)\n", + " bitch_ass_config[\"num_lang\"] = num_lang\n", + " bitch_ass_config[\"main_loss_type\"] = loss_type\n", + " bitch_ass_config[\"diffusion_type\"] = diffusion_type\n", + " bitch_ass_config[\"diff_accelerator\"] = diff_accelerator\n", + " bitch_ass_config[\"binary_data_dir\"] = binary_save_dir\n", + " bitch_ass_config[\"dictionaries\"] = dictionaries\n", + " bitch_ass_config[\"pe\"] = f0_ext # i think variance uses it for pitch ref as ground-truth for pitch training soooo\n", + " bitch_ass_config[\"pe_ckpt\"] = pe_ckpt_pth #same goes to this one\n", + " bitch_ass_config[\"tension_smooth_width\"] = 0.06 #0.12\n", + "\n", + " bitch_ass_config[\"predict_energy\"] = energy_training\n", + " bitch_ass_config[\"predict_breathiness\"] = breathiness_training\n", + " bitch_ass_config[\"predict_tension\"] = tension_training\n", + " bitch_ass_config[\"predict_pitch\"] = pitch_training\n", + " bitch_ass_config[\"predict_voicing\"] = voicing_training\n", + "\n", + " bitch_ass_config[\"use_melody_encoder\"] = use_melody_encoder\n", + " bitch_ass_config[\"use_glide_embed\"] = use_glide_embed\n", + " bitch_ass_config[\"predict_dur\"] = duration_training\n", + " bitch_ass_config[\"binarization_args\"][\"prefer_ds\"] = prefer_ds\n", + " 
#finetune stuff\n", + " bitch_ass_config[\"finetune_enabled\"] = enable_finetuning\n", + " bitch_ass_config[\"finetune_ckpt_path\"] = finetune_ckpt_path\n", + " bitch_ass_config[\"finetune_strict_shapes\"] = finetune_strict_shapes\n", + " #vr\n", + " bitch_ass_config[\"hnsep\"] = parameter_extraction_method\n", + " bitch_ass_config[\"hnsep_ckpt\"] = \"checkpoints/vr/model.pt\"\n", + " #layers\n", + " bitch_ass_config[\"hidden_size\"] = variance_hidden_size\n", + " bitch_ass_config[\"dur_prediction_args\"][\"hidden_size\"] = duration_hidden_size\n", + " bitch_ass_config[\"melody_encoder_args\"][\"hidden_size\"] = melody_encoder_hidden_size\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_args\"][\"num_layers\"] = variance_num_layers\n", + " bitch_ass_config[\"variances_prediction_args\"][\"backbone_args\"][\"num_channels\"] = variance_num_channels\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_type\"] = \"lynxnet\"\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_args\"][\"num_layers\"] = pitch_num_layers\n", + " bitch_ass_config[\"pitch_prediction_args\"][\"backbone_args\"][\"num_channels\"] = pitch_num_channels\n", + "\n", + " with open(\"/content/DiffSinger/configs/variance.yaml\", \"w\") as config:\n", + " yaml.dump(bitch_ass_config, config)\n", + "\n", + "os.makedirs(save_dir, exist_ok=True)\n", + "\n", + "\n", + "with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as f:\n", + " hparams_py_read = f.read()\n", + "hparams_py_read = re.sub(r\"args_work_dir\\s*=\\s*.*\", f\"args_work_dir = '{save_dir}'\", hparams_py_read)\n", + "with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as f:\n", + " f.write(hparams_py_read)\n", + "\n", + "with open(\"/content/DiffSinger/utils/training_utils.py\", \"r\") as f:\n", + " training_utils_stuff = f.read()\n", + "training_utils_stuff = re.sub(\"relative_path\\s*=\\s*.*\", 
\"relative_path = filepath.relative_to(Path('/content').resolve())\", training_utils_stuff)\n", + "with open(\"/content/DiffSinger/utils/training_utils.py\", \"w\") as f:\n", + " f.write(training_utils_stuff)\n", + "\n", + "spk_names = [os.path.splitext(name)[0] for name in spk_names]\n", + "dict_dir = os.path.dirname(dict_path)\n", + "\n", + "print(\"config updated! see below for config's information\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(f\"+++---{config_cap} {singer_type} TRAINING---+++\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"|\")\n", + "print(\"+++---user's settings---+++\")\n", + "print(\"\\n\")\n", + "print(f\"speaker name: {spk_names}\")\n", + "print(\"\\n\")\n", + "print(f\"data augmentation: {data_aug}\")\n", + "print(\"\\n\")\n", + "print(f\"pitch extractor: {f0_ext}\")\n", + "print(\"\\n\")\n", + "print(f\"binary data save directory: {binary_save_dir}\")\n", + "print(\"\\n\")\n", + "print(f\"your model will be saved to: {save_dir}\")\n", + "print(\"\\n\")\n", + "print(\"==========================================================================================\")\n", + "print(\"\\n\")\n", + "print(\"+++---other auto-defined settings---+++\")\n", + "#print(\"\\n\")\n", + "#print(f\"test files (auto selected): {random_ass_test_files}\")\n", + "print(\"\\n\")\n", + "print(f\"dictionary (auto generated): {dict_dir} (check this directory)\")\n", + "print(\"\\n\")\n", + "print(\"==========================================================================================\")\n", + "print(\"\\n\")\n", + "print(\"if you don't like or disagree with any of these options,\")\n", + "print(f\"you can go and edit the config at [/content/DiffSinger/configs/{model_type}.yaml]\")\n" + ], + "metadata": { + "cellView": "form", + "id": "nI3dzDv_Mr9Y" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Preprocess data\n", + "import os\n", + "#we dont need that old f0 
limit change anymore <3\n", + "training_config = f\"/content/DiffSinger/configs/{model_type}.yaml\"\n", + "%cd /content/DiffSinger\n", + "os.environ['PYTHONPATH']='.'\n", + "!CUDA_VISIBLE_DEVICES=0 python /content/DiffSinger/scripts/binarize.py --config {training_config} --reset" + ], + "metadata": { + "cellView": "form", + "id": "76NvDR1cXlDM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Training**" + ], + "metadata": { + "id": "0J3b18EKdzMC" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown #Train your model\n", + "%cd /content/DiffSinger\n", + "import re\n", + "import os\n", + "import yaml\n", + "#@markdown ___\n", + "\n", + "#@markdown Step interval at which your model will be validated and saved\n", + "save_interval = 2000 #@param {type:\"slider\", min:100, max:10000, step:100}\n", + "\n", + "#@markdown batch size setting, too low can cause bottleneck, too high can cause oom\n", + "batch_size = 9 # @param {type:\"slider\", min:1, max:100, step:1}\n", + "\n", + "#@markdown step count at which your model will stop training automatically\n", + "max_updates = 160000 # @param {type:\"slider\", min:100, max:2000000, step:100}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ###**Only edit this section if you want to resume training**\n", + "resume_training = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown select this option if you locally binarized your data | this option will only append your binary data path in your config | \"binary\" folder must be in the same directory as config.yaml\n", + "local_data = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown path to the config you got from training\n", + "re_config_path = \"\" #@param {type:\"string\"}\n", + "model_dir = os.path.dirname(re_config_path)\n", + "save_dir = model_dir\n", + "if resume_training:\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as f:\n", + " hparams_py_read = f.read()\n", + 
" hparams_py_read = re.sub(r\"args_work_dir\\s*=\\s*.*\", f\"args_work_dir = '{save_dir}'\", hparams_py_read)\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as f:\n", + " f.write(hparams_py_read)\n", + " with open(\"/content/DiffSinger/utils/training_utils.py\", \"r\") as f:\n", + " training_utils_stuff = f.read()\n", + " training_utils_stuff = re.sub(\"relative_path\\s*=\\s*.*\", \"relative_path = filepath.relative_to(Path('/content').resolve())\", training_utils_stuff)\n", + " with open(\"/content/DiffSinger/utils/training_utils.py\", \"w\") as f:\n", + " f.write(training_utils_stuff)\n", + "\n", + " config_path = re_config_path\n", + " log_dir = save_dir\n", + "\n", + " !cp {model_dir}/*.txt /content/DiffSinger/dictionaries\n", + "\n", + "else:\n", + " config_path = training_config\n", + " log_dir = conf_dir\n", + "\n", + "with open(config_path, \"r\") as config:\n", + " ehe = yaml.safe_load(config)\n", + "config_dir = os.path.dirname(config_path)\n", + "yuh = os.path.join(config_dir, \"binary\")\n", + "\n", + "ehe[\"val_check_interval\"] = save_interval\n", + "ehe[\"max_batch_size\"] = batch_size\n", + "ehe[\"max_updates\"] = max_updates\n", + "if local_data:\n", + " ehe[\"binary_data_dir\"] = yuh\n", + "with open(config_path, \"w\") as config:\n", + " yaml.dump(ehe, config)\n", + "\n", + "logs = log_dir\n", + "%reload_ext tensorboard\n", + "%tensorboard --logdir {logs}/lightning_logs\n", + "\n", + "!python /content/DiffSinger/scripts/train.py --config {config_path} --exp_name ${save_dir} --reset" + ], + "metadata": { + "cellView": "form", + "id": "Lu5w72UWgccC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Convert model to ONNX format**" + ], + "metadata": { + "id": "FY40fGHEg9_i" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Drop Speakers from Model (Optional)\n", + "#@markdown ___\n", + "#@markdown Use this to drop speakers from your model for distribution. 
You will need to do it for both acoustic and variance models.\n", + "\n", + "drop_model_path = '' #@param {type: \"string\"}\n", + "#@markdown Type the ID of speakers you'd like to KEEP separated by commas. Ex: \"0,3,4\"
\n", + "#@markdown Note: You can find the ID of speakers in the model by opening the ```spk_map.json``` file in the model folder.
\n", + "#@markdown If you see ```{\"natural\": 0, \"power\": 1, \"silly\": 2}``` but only want to keep \"natural\" and \"power\", type ```0,1``` below.\n", + "retain_speakers = '' #@param {type: \"string\"}\n", + "#@markdown If you don't know what this means, don't change it.\n", + "fill_embed = 'zeros' #@param ['zeros', 'random', 'mean', 'cyclic']\n", + "\n", + "drop_out_path = drop_model_path[:-5] + '_spk-dropped.ckpt'\n", + "\n", + "!python /content/DiffSinger/scripts/drop_spk.py {drop_model_path} {drop_out_path} --retain {retain_speakers} --fill {fill_embed}\n", + "\n", + "\n" + ], + "metadata": { + "id": "21ILzW4OEnh4", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Export ONNX\n", + "#@markdown ___\n", + "%cd /content\n", + "from IPython.display import clear_output\n", + "clear_output()\n", + "import os\n", + "import zipfile\n", + "import shutil\n", + "\n", + "if export_mode:\n", + " pass\n", + "else:\n", + " print(\"Installing components to make ONNX work\")\n", + " !wget -O /content/mini.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_25.1.1-2-Linux-x86_64.sh\n", + " !chmod +x /content/mini.sh\n", + " !bash /content/mini.sh -b -f -p /usr/local\n", + " !conda install -q -y jupyter\n", + " !conda install -q -y google-colab -c conda-forge\n", + " !python -m ipykernel install --name \"py310\" --user\n", + " print(\"installing dependencies for ONNX conversion\")\n", + " !pip install -r /content/DiffSinger/requirements-onnx.txt -q -q -q 2>/dev/null\n", + " print(\"Installation complete, time to export those ONNX!\")\n", + "# to counter IF the user is to re-run this cell <3\n", + "if os.path.exists(\"/content/OU_compatible_files\"):\n", + " shutil.rmtree(\"/content/OU_compatible_files\")\n", + " os.remove(\"/content/jpn_dict.txt\")\n", + "else:\n", + " pass\n", + "\n", + "#@markdown select this if you don't want to see the onnx converter's output\n", + "no_output = True # 
@param {type:\"boolean\"}\n", + "\n", + "#@markdown path to your **ACOUSTIC CHECKPOINT** (leave blank if you don't have any): automatically use latest checkpoint that is in the same folder\n", + "acoustic_checkpoint_path = \"\" #@param{type:\"string'}\n", + "acoustic_folder_name = os.path.basename(os.path.dirname(acoustic_checkpoint_path)) + \"_acoustic\"\n", + "acoustic_folder_path = os.path.dirname(acoustic_checkpoint_path)\n", + "\n", + "#@markdown path to your **VARIANCE CHECKPOINT** (leave blank if you don't have any): automatically use latest checkpoint that is in the same folder\n", + "variance_checkpoint_path = \"\" #@param{type:\"string'}\n", + "variance_folder_name = os.path.basename(os.path.dirname(variance_checkpoint_path)) + \"_variance\"\n", + "variance_folder_path = os.path.dirname(variance_checkpoint_path)\n", + "\n", + "#@markdown path to where you want to save your ONNX files (it will create a folder named \"onnx\" in this path)\n", + "exp_folder = \"\" #@param{type:\"string\"}\n", + "\n", + "acoustic_onnx_exp = exp_folder + \"/onnx/acoustic\"\n", + "variance_onnx_exp = exp_folder + \"/onnx/variance\"\n", + "\n", + "if not acoustic_checkpoint_path:\n", + " print(\"\\n\")\n", + " print(\"acoustic ckeckpoint path not specified, not exporting acoustic ONNX...\")\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"converting acoustic to onnx...\")\n", + " #cp stuff cus apparently exporter doesnt work without it\n", + " !cp {acoustic_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{acoustic_folder_name}\n", + " search_text = \" args_work_dir = os.path.join(\"\n", + " replacement = f\" args_work_dir = '{acoustic_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text in line:\n", + " lines[i] = replacement + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + 
" file.writelines(lines)\n", + " #incase if anyone wanna change it lmao\n", + " search_text_alt = \" args_work_dir = '\"\n", + " replacement_alt = f\" args_work_dir = '{acoustic_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text_alt in line:\n", + " lines[i] = replacement_alt + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + "\n", + " if no_output:\n", + " !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic >/dev/null 2>&1\n", + " else:\n", + " !python /content/DiffSinger/scripts/export.py acoustic --exp {acoustic_folder_name} --out {exp_folder}/onnx/acoustic\n", + "\n", + "\n", + "if not variance_checkpoint_path:\n", + " print(\"\\n\")\n", + " print(\"variance ckeckpoint path not specified, not exporting variance ONNX...\")\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"converting variance to onnx...\")\n", + " #cp stuff cus apparently exporter doesnt work without it\n", + " !cp {variance_folder_path}/config.yaml -r /content/DiffSinger/checkpoints/{variance_folder_name}\n", + " search_text = \" args_work_dir = os.path.join(\"\n", + " replacement = f\" args_work_dir = '{variance_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", + " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text in line:\n", + " lines[i] = replacement + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + " #incase if anyone wanna change it lmao\n", + " search_text_alt = \" args_work_dir = '\"\n", + " replacement_alt = f\" args_work_dir = '{variance_folder_path}'\"\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"r\") as file:\n", 
+ " lines = file.readlines()\n", + " for i, line in enumerate(lines):\n", + " if search_text_alt in line:\n", + " lines[i] = replacement_alt + \"\\n\"\n", + " break\n", + " with open(\"/content/DiffSinger/utils/hparams.py\", \"w\") as file:\n", + " file.writelines(lines)\n", + " if no_output:\n", + " !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance >/dev/null 2>&1\n", + " else:\n", + " !python /content/DiffSinger/scripts/export.py variance --exp {variance_folder_name} --out {exp_folder}/onnx/variance\n", + "\n", + "\n", + "if not variance_checkpoint_path:\n", + " folder_paths = [acoustic_onnx_exp]\n", + "elif not acoustic_checkpoint_path:\n", + " folder_paths = [variance_onnx_exp]\n", + "else:\n", + " folder_paths = [acoustic_onnx_exp, variance_onnx_exp]\n", + "\n", + "patterns = {\"acoustic.onnx\": \"acoustic.onnx\", \"dur.onnx\": \"dur.onnx\", \"linguistic.onnx\": \"linguistic.onnx\", \"pitch.onnx\": \"pitch.onnx\", \"variance.onnx\": \"variance.onnx\", \"phonemes.txt\": \"phonemes.txt\"}\n", + "\n", + "for folder_path in folder_paths:\n", + " for filename in os.listdir(folder_path):\n", + " for pattern, new_name in patterns.items():\n", + " if pattern in filename:\n", + " old_path = os.path.join(folder_path, filename)\n", + " new_path = os.path.join(folder_path, new_name)\n", + " if os.path.exists(old_path):\n", + " os.rename(old_path, new_path)\n", + "for folder_path in folder_paths:\n", + " for filename in os.listdir(folder_path):\n", + " if \"acoustic_acoustic.\" in filename:\n", + " new_filename = filename.replace(\"acoustic_acoustic.\", \"acoustic_\")\n", + " elif \"variance_variance.\" in filename:\n", + " new_filename = filename.replace(\"variance_variance.\", \"variance_\")\n", + " else:\n", + " new_filename = filename\n", + " old_path = os.path.join(folder_path, filename)\n", + " new_path = os.path.join(folder_path, new_filename)\n", + " os.rename(old_path, new_path)\n", + 
"print(\"\\n\")\n", + "print(\"ONNX export complete! Please refer to https://github.com/xunmengshe/OpenUtau/wiki/Voicebank-Development to make your model OU compatible\")\n", + "print(\"\\n\")\n", + "print(\"Or use the 'Build OpenUtau VB' cell to have things set up for you\")\n" + ], + "metadata": { + "id": "x33iZhZchEMW", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Miscellaneous**" + ], + "metadata": { + "id": "4sbU1aH5kGFE" + } + }, + { + "cell_type": "code", + "source": [ + "#@title #Raw data conversion\n", + "#@markdown ___\n", + "%cd /content\n", + "#@markdown This cell will export .lab and .ds files along with your data\n", + "\n", + "data_type = \"lab + wav (NNSVS format)\" # @param [\"lab + wav (NNSVS format)\"]\n", + "\n", + "#@markdown The path to your data zip file\n", + "\n", + "data_zip_path = \"\" #@param {type:\"string\"}\n", + "\n", + "#@markdown The path you will be saving the data to\n", + "\n", + "data_save_path = \"\" #@param {type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "export_ds = True\n", + "\n", + "#@markdown _These values can exceed the amount that's in your data to maximize the segment length or to keep the data as is_\n", + "\n", + "#@markdown Determine how long it will segment your data to based on silence phoneme placement (seconds)\n", + "segment_length = 15 #@param {type:\"slider\", min:5, max:35, step:1}\n", + "\n", + "#@markdown Determine how many silence phoneme is allowed in the middle of each segment\n", + "max_silence_phoneme_amount = 2 #@param {type:\"slider\", min:0, max:50, step:1}\n", + "\n", + "# leaving -S at 60 so max silence can be 60 seconds that exceeds the segment legnth cap idk why///\n", + "# making the segment length cap at 35 secs because any longer than that would make training goes really slow\n", + "\n", + "# my ass dont remember why i made two... 
i think one is unnecessary extra but mehhh\n", + "all_shits = \"/content/raw_data\"\n", + "all_shits_not_wav_n_lab = \"/content/raw_data/diffsinger_db\"\n", + "\n", + "import os\n", + "import csv\n", + "import json\n", + "import shutil\n", + "from pydub import AudioSegment\n", + "\n", + "if os.path.exists(\"/content/raw_data\"):\n", + " shutil.rmtree(\"/content/raw_data\")\n", + "\n", + "if not os.path.exists(all_shits_not_wav_n_lab):\n", + " os.makedirs(all_shits_not_wav_n_lab)\n", + "\n", + "# using 'if not' bc i edited the wrong section which im also too lazy to fix it <3\n", + "if not data_type == \"lab + wav (NNSVS format)\":\n", + " #changed to 7zip to support more compression types\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + " for root, dirs, files in os.walk(all_shits):\n", + " for filename in files:\n", + " if filename.endswith(\".lab\"):\n", + " file_path = os.path.join(root, filename)\n", + " with open(file_path, \"r\") as file:\n", + " file_data = file.read()\n", + " file_data = file_data.replace(\"SP\", \"pau\")\n", + " file_data = file_data.replace(\"br\", \"AP\")\n", + " with open(file_path, \"w\") as file:\n", + " file.write(file_data)\n", + "\n", + "else:\n", + " !7z x \"$data_zip_path\" -o{all_shits_not_wav_n_lab}\n", + "\n", + "\n", + "# for funny auto dict generator lmao\n", + "out = \"/content/raw_data/custom_dict.txt\"\n", + "\n", + "phonemes = set()\n", + "\n", + "def is_excluded(phoneme):\n", + " return phoneme in [\"pau\", \"AP\", \"SP\"]\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " phoneme_folder_path = all_shits\n", + " for root, dirs, files in os.walk(phoneme_folder_path):\n", + " for file in files:\n", + " if file.endswith(\".lab\"):\n", + " fpath = os.path.join(root, file)\n", + " with open(fpath, \"r\") as lab_file:\n", + " for line in lab_file:\n", + " line = line.strip()\n", + " if line:\n", + " phoneme = line.split()[2]\n", + " if not is_excluded(phoneme):\n", + " 
phonemes.add(phoneme)\n", + "\n", + "with open(out, \"w\") as f:\n", + " for phoneme in sorted(phonemes):\n", + " f.write(phoneme + \"\t\" + phoneme + \"\\n\")\n", + "\n", + "# for vowels and consonants.txt.... well adding liquid type for uta's script\n", + "dict_path = out\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\"}\n", + "liquid_types = {\"y\", \"w\", \"l\", \"r\"} # r for english labels, it should be fine with jp too\n", + "vowel_data = []\n", + "consonant_data = []\n", + "liquid_data = []\n", + "\n", + "with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " if phoneme[0] in vowel_types:\n", + " vowel_data.append(phoneme)\n", + " elif phoneme[0] in liquid_types:\n", + " liquid_data.append(phoneme)\n", + " else:\n", + " consonant_data.append(phoneme)\n", + "\n", + "vowel_data.sort()\n", + "liquid_data.sort()\n", + "consonant_data.sort()\n", + "directory = os.path.dirname(dict_path)\n", + "\n", + "# make txt for language json file\n", + "vowel_txt_path = os.path.join(directory, \"vowels.txt\")\n", + "with open(vowel_txt_path, \"w\") as f:\n", + " f.write(\" \".join(vowel_data))\n", + "liquid_txt_path = os.path.join(directory, \"liquids.txt\")\n", + "with open(liquid_txt_path, \"w\") as f:\n", + " f.write(\" \".join(liquid_data))\n", + "consonant_txt_path = os.path.join(directory, \"consonants.txt\")\n", + "with open(consonant_txt_path, \"w\") as f:\n", + " f.write(\" \".join(consonant_data))\n", + "\n", + "\n", + "# here's a funny json append\n", + "with open(vowel_txt_path, \"r\") as f:\n", + " vowel_data = f.read().split()\n", + "with open(liquid_txt_path, \"r\") as f:\n", + " liquid_data = f.read().split()\n", + "with open(consonant_txt_path, \"r\") as f:\n", + " consonant_data = f.read().split()\n", + "phones4json = {\"vowels\": vowel_data, \"liquids\": liquid_data}\n", + "with open(\"/content/nnsvs-db-converter/lang.sample.json\", \"w\") as rawr:\n", + " 
json.dump(phones4json, rawr, indent=4)\n", + "\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " db_converter_script = \"/content/nnsvs-db-converter/db_converter.py\"\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " if os.path.isdir(raw_folder_path):\n", + " !python {db_converter_script} -s {max_silence_phoneme_amount} -S 60 -l {segment_length} ${export_lab} -mD -c -L \"/content/nnsvs-db-converter/lang.sample.json\" -w htk --folder {raw_folder_path}\n", + "\n", + "if data_type == \"lab + wav (NNSVS format)\":\n", + " for raw_folder_name in os.listdir(all_shits_not_wav_n_lab):\n", + " raw_folder_path = os.path.join(all_shits_not_wav_n_lab, raw_folder_name)\n", + " !rm -rf {raw_folder_path}/*.wav {raw_folder_path}/*.lab\n", + " !mv {raw_folder_path}/diffsinger_db/* {raw_folder_path} 2> /dev/null\n", + " !rm -rf {raw_folder_path}/diffsinger_db\n", + " #!cp {raw_folder_path}/wavs/*.wav {raw_folder_path}\n", + "\n", + "# make it replace the first SP to AP cus it seems like people always forgot about it\n", + "for root, _, files in os.walk(all_shits_not_wav_n_lab):\n", + " for file in files:\n", + " if file.endswith(\".csv\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\", newline=\"\") as input_file:\n", + " csv_reader = csv.reader(input_file)\n", + " data = [row for row in csv_reader]\n", + " header = data[0]\n", + " if \"ph_seq\" in header:\n", + " ph_seq_index = header.index(\"ph_seq\")\n", + " if len(data) > 1 and len(data[1]) > ph_seq_index:\n", + " data[1][ph_seq_index] = data[1][ph_seq_index].replace(\"SP\", \"AP\", 1)\n", + " with open(file_path, \"w\", newline=\"\") as output_file:\n", + " csv_writer = csv.writer(output_file)\n", + " csv_writer.writerows(data)\n", + "\n", + "print(\"extraction complete!\")\n", + "print(\"\\n\")\n", + "print(\"zipping up files...\")\n", + "!zip -q -9 -r 
{data_save_path}/data.zip /content/raw_data/*" + ], + "metadata": { + "cellView": "form", + "id": "AI7EQ2jQkGEq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@markdown # Build OpenUtau VB\n", + "#@markdown ___\n", + "#i need to clean this up it seems\n", + "#plan: add a build ou section here by inserting onnx paths (or just the folder containing the folders to the onnx files) to build ou\n", + "# ill have a config read function too so i dont have to add checkmark of if people train with embeds or shallow diff or not <3\n", + "# yes im lazy rawr x3\n", + "%cd /content\n", + "import os\n", + "import shutil\n", + "import yaml\n", + "from IPython.display import clear_output\n", + "\n", + "constr_folder = \"/content/OU_voicebank\"\n", + "if not os.path.exists(constr_folder):\n", + " os.makedirs(constr_folder)\n", + "else:\n", + " shutil.rmtree(constr_folder)\n", + " os.makedirs(constr_folder)\n", + "\n", + "clear_output()\n", + "\n", + "#@markdown path to your **ACOUSTIC ONNX FOLDER**\n", + "acoustic_onnx_folder = \"\" #@param{type:\"string'}\n", + "#@markdown path to the config.yaml of acoustic model\n", + "acoustic_config = \"\" #@param{type:\"string'}\n", + "\n", + "#@markdown path to your **VARIANCE ONNX FOLDER**\n", + "variance_onnx_folder = \"\" #@param{type:\"string'}\n", + "#@markdown path to the config.yaml of variance model\n", + "variance_config = \"\" #@param{type:\"string'}\n", + "\n", + "#@markdown path to your word to phoneme dict (leave blank to use default Japanese dict)\n", + "dictionary_path = \"\" #@param{type:\"string\"}\n", + "\n", + "#@markdown path to the folder you want to save the zip file to\n", + "save_path = \"\" #@param{type:\"string\"}\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ## Character Configuration | character.txt and character.yaml\n", + "\n", + "#@markdown your character display name| **required**\n", + "name = \"\" #@param{type:\"string\"}\n", + "\n", + 
"print(\"copying files...\")\n", + "main_stuff = f\"{constr_folder}/{name}\"\n", + "if not os.path.exists(main_stuff):\n", + " os.makedirs(main_stuff)\n", + "if not os.path.exists(f\"{main_stuff}/dsmain\"):\n", + " os.makedirs(f\"{main_stuff}/dsmain/embeds/acoustic\")\n", + " os.makedirs(f\"{main_stuff}/dsmain/embeds/variance\")\n", + "!cp {acoustic_onnx_folder}/acoustic.onnx {main_stuff}/dsmain\n", + "!cp {acoustic_onnx_folder}/phonemes.txt {main_stuff}/dsmain\n", + "!cp {acoustic_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/acoustic >/dev/null 2>&1\n", + "!cp {variance_onnx_folder}/*.emb {main_stuff}/dsmain/embeds/variance >/dev/null 2>&1\n", + "\n", + "if variance_onnx_folder:\n", + " !cp {variance_onnx_folder}/linguistic.onnx {main_stuff}/dsmain\n", + "else:\n", + " pass\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing character.txt...\")\n", + "with open(f\"{main_stuff}/character.txt\", \"w\") as file:\n", + " file.write(f\"name={name}\\n\")\n", + " file.write(\"image=\\n\")\n", + " file.write(\"author=\\n\")\n", + " file.write(\"voice=\\n\")\n", + " file.write(\"web=\\n\")\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing character.yaml...\")\n", + "with open(f\"{main_stuff}/character.yaml\", \"w\") as file:\n", + " file.write(\"text_file_encoding: utf-8\\n\")\n", + " file.write(\"portrait:\\n\")\n", + " file.write(\"portrait_opacity: 0.45\\n\")\n", + " file.write(\"default_phonemizer: OpenUtau.Core.DiffSinger.DiffSingerPhonemizer\\n\")\n", + " file.write(\"singer_type: diffsinger\\n\")\n", + "acoustic_emb_files = os.listdir(acoustic_onnx_folder)\n", + "acoustic_embeds = []\n", + "acoustic_color_suffix = []\n", + "for file in acoustic_emb_files:\n", + " if file.endswith(\".emb\"):\n", + " acoustic_emb = os.path.splitext(file)[0]\n", + " acoustic_embeds.append(\"dsmain/embeds/acoustic/\" + acoustic_emb)\n", + " acoustic_color_suffix.append(acoustic_emb)\n", + "subbanks = []\n", + "for i, (acoustic_embed_color, acoustic_embed_suffix) in 
enumerate(zip(acoustic_color_suffix, acoustic_embeds), start=1):\n", + " color = f\"{i:02}: {acoustic_embed_color}\"\n", + " suffix = f\"{acoustic_embed_suffix}\"\n", + " subbanks.append({\"color\": color, \"suffix\": suffix})\n", + "if subbanks:\n", + " with open(f\"{main_stuff}/character.yaml\", \"r\") as config:\n", + " i_wanna_die_slash_j = yaml.safe_load(config)\n", + " i_wanna_die_slash_j[\"subbanks\"] = subbanks\n", + " with open(f\"{main_stuff}/character.yaml\", \"w\") as config:\n", + " yaml.dump(i_wanna_die_slash_j, config)\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing dsconfig.yaml for acoustic...\")\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: dsmain/phonemes.txt\\n\")\n", + " file.write(\"acoustic: dsmain/acoustic.onnx\\n\")\n", + " file.write(\"vocoder: nsf_hifigan\\n\")\n", + " file.write(\"singer_type: diffsinger\\n\")\n", + "with open(acoustic_config, \"r\") as config:\n", + " mfking_config = yaml.safe_load(config)\n", + "use_energy_embed = mfking_config.get(\"use_energy_embed\")\n", + "use_breathiness_embed = mfking_config.get(\"use_breathiness_embed\")\n", + "use_shallow_diffusion = mfking_config.get(\"use_shallow_diffusion\")\n", + "max_depth = mfking_config.get(\"T_start\")\n", + "speakers = mfking_config.get(\"speakers\") #looking back here, why is this even here lmao cus i used acoustic_embeds instead of speakers\n", + "augmentation_arg = mfking_config.get(\"augmentation_args\")\n", + "pitch_aug = mfking_config.get(\"use_key_shift_embed\")\n", + "time_aug = mfking_config.get(\"use_speed_embed\")\n", + "voicing = mfking_config.get(\"use_voicing_embed\")\n", + "tension = mfking_config.get(\"use_tension_embed\")\n", + "sample_rate = mfking_config.get(\"audio_sample_rate\")\n", + "hop_size = mfking_config.get(\"hop_size\")\n", + "win_size = mfking_config.get(\"win_size\")\n", + "fft_size = mfking_config.get(\"fft_size\")\n", + "num_mel_bins = mfking_config.get(\"audio_num_mel_bins\")\n", 
+ "mel_fmin = mfking_config.get(\"fmin\")\n", + "mel_fmax = mfking_config.get(\"fmax\")\n", + "mel_base = mfking_config.get(\"mel_base\")\n", + "\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"r\") as config:\n", + " why_are_there_so_many_i_could_prob_make_it_one = yaml.safe_load(config)\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_energy_embed\"] = use_energy_embed\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_breathiness_embed\"] = use_breathiness_embed\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_variable_depth\"] = use_shallow_diffusion\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"max_depth\"] = max_depth\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"augmentation_args\"] = augmentation_arg\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_key_shift_embed\"] = pitch_aug\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_speed_embed\"] = time_aug\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_voicing_embed\"] = voicing\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_tension_embed\"] = tension\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"use_continuous_acceleration\"] = True\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"sample_rate\"] = sample_rate\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"hop_size\"] = hop_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"win_size\"] = win_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fft_size\"] = fft_size\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"num_mel_bins\"] = num_mel_bins\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fmin\"] = mel_fmin\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"fmax\"] = mel_fmax\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"mel_base\"] = mel_base\n", + "why_are_there_so_many_i_could_prob_make_it_one[\"mel_scale\"] = \"slaney\"\n", + "\n", + "\n", + "if subbanks:\n", + " 
why_are_there_so_many_i_could_prob_make_it_one[\"speakers\"] = acoustic_embeds\n", + "with open(f\"{main_stuff}/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(why_are_there_so_many_i_could_prob_make_it_one, config)\n", + "\n", + "\n", + "variance_emb_files = os.listdir(variance_onnx_folder)\n", + "variance_embeds = []\n", + "for file in variance_emb_files:\n", + " if file.endswith(\".emb\"):\n", + " variance_emb = os.path.splitext(file)[0]\n", + " variance_embeds.append(\"../dsmain/embeds/variance/\" + variance_emb)\n", + "\n", + "print(\"\\n\")\n", + "print(\"writing dsdict.yaml...\")\n", + "if not dictionary_path:\n", + " dict_path = \"/content/jpn_dict.txt\"\n", + "else:\n", + " dict_path = dictionary_path\n", + "\n", + "# for symbols list\n", + "phoneme_dict_path = f\"{acoustic_onnx_folder}/dictionary.txt\"\n", + "\n", + "dsdict = \"dsdict.yaml\"\n", + "\n", + "def parse_phonemes(phonemes_str):\n", + " return phonemes_str.split()\n", + "\n", + "entries = []\n", + "vowel_types = {\"a\", \"i\", \"u\", \"e\", \"o\", \"N\", \"M\", \"NG\", \"cl\", \"vf\"}\n", + "vowel_data = []\n", + "stop_data = []\n", + "\n", + "# Process the specified dictionary\n", + "with open(dict_path, \"r\") as f:\n", + " for line in f:\n", + " word, phonemes_str = line.strip().split(\"\\t\")\n", + " phonemes = parse_phonemes(phonemes_str)\n", + " if len(phonemes) == 1:\n", + " entries.append({\"grapheme\": word, \"phonemes\": phonemes})\n", + " else:\n", + " entries.append({\"grapheme\": word, \"phonemes\": phonemes})\n", + "\n", + "with open(phoneme_dict_path, \"r\") as f:\n", + " for line in f:\n", + " phoneme, _ = line.strip().split(\"\\t\")\n", + " phoneme_type = \"vowel\" if phoneme[0] in vowel_types else \"stop\"\n", + " entry = {\"symbol\": phoneme, \"type\": phoneme_type}\n", + " if phoneme_type == \"vowel\":\n", + " vowel_data.append(entry)\n", + " else:\n", + " stop_data.append(entry)\n", + "\n", + "vowel_data.sort(key=lambda x: x[\"symbol\"])\n", + 
"stop_data.sort(key=lambda x: x[\"symbol\"])\n", + "\n", + "dsdict_path = os.path.join(constr_folder, dsdict)\n", + "with open(dsdict_path, \"w\") as f:\n", + " f.write(\"entries:\\n\")\n", + " for entry in entries:\n", + " f.write(f\"- grapheme: {entry['grapheme']}\\n\")\n", + " f.write(\" phonemes:\\n\")\n", + " for phoneme in entry[\"phonemes\"]:\n", + " f.write(f\" - {phoneme}\\n\")\n", + "\n", + " f.write(\"\\nsymbols:\\n\")\n", + " for entry in vowel_data + stop_data:\n", + " f.write(f\"- symbol: {entry['symbol']}\\n\")\n", + " f.write(f\" type: {entry['type']}\\n\")\n", + "\n", + "with open(variance_config, \"r\") as config:\n", + " mfking_config = yaml.safe_load(config)\n", + "sample_rate = mfking_config.get(\"audio_sample_rate\")\n", + "hop_size = mfking_config.get(\"hop_size\")\n", + "predict_dur = mfking_config.get(\"predict_dur\")\n", + "predict_pitch = mfking_config.get(\"predict_pitch\")\n", + "use_melody_encoder = mfking_config.get(\"use_melody_encoder\")\n", + "predict_voicing = mfking_config.get(\"predict_voicing\")\n", + "predict_tension = mfking_config.get(\"predict_tension\")\n", + "predict_energy = mfking_config.get(\"predict_energy\")\n", + "predict_breathiness = mfking_config.get(\"predict_breathiness\")\n", + "\n", + "dur_onnx_path = variance_onnx_folder + \"/dur.onnx\"\n", + "if os.path.exists(dur_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dsdur directory and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dsdur\")\n", + " !cp {dur_onnx_path} {main_stuff}/dsdur\n", + " !cp {dsdict_path} {main_stuff}/dsdur\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"dur: dur.onnx\\n\")\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"r\") as config:\n", + " dsdur_config = yaml.safe_load(config)\n", + " dsdur_config[\"use_continuous_acceleration\"] = 
True\n", + " dsdur_config[\"sample_rate\"] = sample_rate\n", + " dsdur_config[\"hop_size\"] = hop_size\n", + " dsdur_config[\"predict_dur\"] = predict_dur\n", + " if subbanks:\n", + " dsdur_config[\"speakers\"] = variance_embeds\n", + " with open(f\"{main_stuff}/dsdur/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dsdur_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"dur.onnx not found, skipping on making dsdur folder...\")\n", + "\n", + "pitch_onnx_path = variance_onnx_folder + \"/pitch.onnx\"\n", + "if os.path.exists(pitch_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dspitch directory and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dspitch\")\n", + " !cp {pitch_onnx_path} {main_stuff}/dspitch\n", + " !cp {dsdict_path} {main_stuff}/dspitch\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"pitch: pitch.onnx\\n\")\n", + " file.write(\"use_expr: true\\n\")\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"r\") as config:\n", + " dspitch_config = yaml.safe_load(config)\n", + " dspitch_config[\"use_continuous_acceleration\"] = True\n", + " dspitch_config[\"sample_rate\"] = sample_rate\n", + " dspitch_config[\"hop_size\"] = hop_size\n", + " dspitch_config[\"predict_dur\"] = predict_pitch\n", + " if subbanks:\n", + " dspitch_config[\"speakers\"] = variance_embeds\n", + " dspitch_config[\"use_note_rest\"] = use_melody_encoder\n", + " with open(f\"{main_stuff}/dspitch/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dspitch_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"pitch.onnx not found, skipping on making dspitch folder...\")\n", + "\n", + "variance_onnx_path = variance_onnx_folder + \"/variance.onnx\"\n", + "if os.path.exists(variance_onnx_path):\n", + " print(\"\\n\")\n", + " print(\"making dsvariance directory 
and necessary files...\")\n", + " os.makedirs(f\"{main_stuff}/dsvariance\")\n", + " !cp {variance_onnx_path} {main_stuff}/dsvariance\n", + " !cp {dsdict_path} {main_stuff}/dsvariance\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"w\") as file:\n", + " file.write(\"phonemes: ../dsmain/phonemes.txt\\n\")\n", + " file.write(\"linguistic: ../dsmain/linguistic.onnx\\n\")\n", + " file.write(\"variance: variance.onnx\\n\")\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"r\") as config:\n", + " dsvariance_config = yaml.safe_load(config)\n", + " dsvariance_config[\"use_continuous_acceleration\"] = True\n", + " dsvariance_config[\"sample_rate\"] = sample_rate\n", + " dsvariance_config[\"hop_size\"] = hop_size\n", + " dsvariance_config[\"predict_dur\"] = True #this one will always be true cus if there's no variance model, it shouldnt make this folder in the first place\n", + " dsvariance_config[\"predict_voicing\"] = predict_voicing\n", + " dsvariance_config[\"predict_tension\"] = predict_tension\n", + " dsvariance_config[\"predict_energy\"] = predict_energy\n", + " dsvariance_config[\"predict_breathiness\"] = predict_breathiness\n", + " if subbanks:\n", + " dsvariance_config[\"speakers\"] = variance_embeds\n", + " with open(f\"{main_stuff}/dsvariance/dsconfig.yaml\", \"w\") as config:\n", + " yaml.dump(dsvariance_config, config)\n", + "else:\n", + " print(\"\\n\")\n", + " print(\"variance.onnx not found, skipping on making dsvariance folder...\")\n", + "\n", + "!rm -rf {dsdict_path}\n", + "#im too lazy to write codes so ill just do this, itll only remove those folders if they're empty anyway\n", + "!rm -d {main_stuff}/dsmain/embeds/* >/dev/null 2>&1\n", + "!rm -d {main_stuff}/dsmain/embeds >/dev/null 2>&1\n", + "\n", + "print(\"\\n\")\n", + "print(\"zipping up files...\")\n", + "!zip -q -9 -r {save_path}/{name}.zip {main_stuff}/*\n", + "\n", + "print(\"\\n\")\n", + "print(\"done!\")\n", + "\n", + "print(\"\\n\")\n", + "print(\"You can 
download your model zip and use it in OpenUtau! If anything needed to be edit in the config then please do so\")" + ], + "metadata": { + "cellView": "form", + "id": "A70Sc3Hbmxh0" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 41943aecd689092362a9e0ad0fc1e188b088ba48 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Thu, 18 Dec 2025 12:06:06 +0700 Subject: [PATCH 07/12] =?UTF-8?q?=E0=B8=AA=E0=B8=A3=E0=B9=89=E0=B8=B2?= =?UTF-8?q?=E0=B8=87=E0=B9=82=E0=B8=94=E0=B8=A2=E0=B9=83=E0=B8=8A=E0=B9=89?= =?UTF-8?q?=20Colab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSinger_colab_notebook.ipynb | 127 +++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 12 deletions(-) diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb index e1a4deaba..74f4a3461 100644 --- a/DiffSinger_colab_notebook.ipynb +++ b/DiffSinger_colab_notebook.ipynb @@ -12,7 +12,7 @@ "FY40fGHEg9_i", "4sbU1aH5kGFE" ], - "gpuType": "T4", + "gpuType": "V5E1", "include_colab_link": true }, "kernelspec": { @@ -22,7 +22,7 @@ "language_info": { "name": "python" }, - "accelerator": "GPU" + "accelerator": "TPU" }, "cells": [ { @@ -75,13 +75,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "pK8aicf8A2sj", "cellView": "form", - "collapsed": true + "collapsed": true, + "outputId": "f68cea87-c72d-44e3-bef7-a902659b5088", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 610 + } }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "setup complete!\n", + "|\n", + "|\n", + "|\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " 
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], "source": [ "from IPython.display import clear_output, Audio, display, HTML\n", "import os\n", @@ -147,7 +193,7 @@ " clear_output()\n", "\n", "#@title # Mount Google Drive and Setup\n", - "export_mode = False # @param {\"type\":\"boolean\"}\n", + "export_mode = True # @param {\"type\":\"boolean\"}\n", "drive.mount(\"/content/drive\")\n", "\n", "if export_mode:\n", @@ -503,10 +549,49 @@ ], "metadata": { "cellView": "form", - "id": "JsP1TGg2F1g3" + "id": "JsP1TGg2F1g3", + "outputId": "1223a04e-0038-4031-8d2f-26f2be92fb45", + "colab": { + "base_uri": "https://localhost:8080/" + } }, - "execution_count": null, - "outputs": [] + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content\n", + "\n", + "7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21\n", + "p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)\n", + "\n", + "\n", + "\n", + "Command Line Error:\n", + "Cannot find archive name\n", + "extraction complete!\n", + "|\n", + "|\n", + "|\n", + "I'm also nice enough to convert your data and also write your dictionaries lmao. You are welcome :)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:300: SyntaxWarning: invalid escape sequence '\\('\n", + " m = re.match('([su]([0-9]{1,2})p?) 
\\(([0-9]{1,2}) bit\\)$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:301: SyntaxWarning: invalid escape sequence '\\('\n", + " m2 = re.match('([su]([0-9]{1,2})p?)( \\(default\\))?$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:310: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(flt)p?( \\(default\\))?$', token):\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:314: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(dbl)p?( \\(default\\))?$', token):\n" + ] + } + ] }, { "cell_type": "code", @@ -897,10 +982,28 @@ ], "metadata": { "cellView": "form", - "id": "nI3dzDv_Mr9Y" + "id": "nI3dzDv_Mr9Y", + "outputId": "b08a969d-968e-4ed9-d82f-32b1ed008316", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 245 + } }, - "execution_count": null, - "outputs": [] + "execution_count": 3, + "outputs": [ + { + "output_type": "error", + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: ''", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-3168009205.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdump\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbitch_ass_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.12/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: ''" + ] + } + ] }, { "cell_type": "code", From 188f7e4837ff94889ff4e7c5e64c3c9ce8d0070c Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Sun, 22 Feb 2026 16:40:15 +0700 Subject: [PATCH 08/12] =?UTF-8?q?=E0=B8=AA=E0=B8=A3=E0=B9=89=E0=B8=B2?= =?UTF-8?q?=E0=B8=87=E0=B9=82=E0=B8=94=E0=B8=A2=E0=B9=83=E0=B8=8A=E0=B9=89?= =?UTF-8?q?=20Colab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSinger_colab_notebook.ipynb | 127 +++----------------------------- 1 file changed, 12 insertions(+), 115 deletions(-) diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb index 74f4a3461..e1a4deaba 100644 --- a/DiffSinger_colab_notebook.ipynb +++ b/DiffSinger_colab_notebook.ipynb @@ -12,7 +12,7 @@ "FY40fGHEg9_i", "4sbU1aH5kGFE" ], - "gpuType": "V5E1", + "gpuType": "T4", "include_colab_link": true }, "kernelspec": { @@ -22,7 +22,7 @@ "language_info": { "name": "python" }, - "accelerator": "TPU" + "accelerator": "GPU" }, "cells": [ { @@ -75,59 +75,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "pK8aicf8A2sj", "cellView": "form", - "collapsed": true, - "outputId": "f68cea87-c72d-44e3-bef7-a902659b5088", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 610 - } + "collapsed": true }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "setup complete!\n", - "|\n", - "|\n", - "|\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "" - ] - }, - "metadata": {} - }, - { - "output_type": "execute_result", - 
"data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {}, - "execution_count": 1 - } - ], + "outputs": [], "source": [ "from IPython.display import clear_output, Audio, display, HTML\n", "import os\n", @@ -193,7 +147,7 @@ " clear_output()\n", "\n", "#@title # Mount Google Drive and Setup\n", - "export_mode = True # @param {\"type\":\"boolean\"}\n", + "export_mode = False # @param {\"type\":\"boolean\"}\n", "drive.mount(\"/content/drive\")\n", "\n", "if export_mode:\n", @@ -549,49 +503,10 @@ ], "metadata": { "cellView": "form", - "id": "JsP1TGg2F1g3", - "outputId": "1223a04e-0038-4031-8d2f-26f2be92fb45", - "colab": { - "base_uri": "https://localhost:8080/" - } + "id": "JsP1TGg2F1g3" }, - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/content\n", - "\n", - "7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21\n", - "p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)\n", - "\n", - "\n", - "\n", - "Command Line Error:\n", - "Cannot find archive name\n", - "extraction complete!\n", - "|\n", - "|\n", - "|\n", - "I'm also nice enough to convert your data and also write your dictionaries lmao. You are welcome :)\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:300: SyntaxWarning: invalid escape sequence '\\('\n", - " m = re.match('([su]([0-9]{1,2})p?) 
\\(([0-9]{1,2}) bit\\)$', token)\n", - "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:301: SyntaxWarning: invalid escape sequence '\\('\n", - " m2 = re.match('([su]([0-9]{1,2})p?)( \\(default\\))?$', token)\n", - "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:310: SyntaxWarning: invalid escape sequence '\\('\n", - " elif re.match('(flt)p?( \\(default\\))?$', token):\n", - "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:314: SyntaxWarning: invalid escape sequence '\\('\n", - " elif re.match('(dbl)p?( \\(default\\))?$', token):\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -982,28 +897,10 @@ ], "metadata": { "cellView": "form", - "id": "nI3dzDv_Mr9Y", - "outputId": "b08a969d-968e-4ed9-d82f-32b1ed008316", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 245 - } + "id": "nI3dzDv_Mr9Y" }, - "execution_count": 3, - "outputs": [ - { - "output_type": "error", - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: ''", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipython-input-3168009205.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdump\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbitch_ass_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 334\u001b[0;31m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexist_ok\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/lib/python3.12/os.py\u001b[0m in \u001b[0;36mmakedirs\u001b[0;34m(name, mode, exist_ok)\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: ''" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", From 399963baf7b3539e12ed363d2c256e1b9c988e14 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Thu, 19 Mar 2026 06:09:51 +0700 Subject: [PATCH 09/12] =?UTF-8?q?=E0=B8=AA=E0=B8=A3=E0=B9=89=E0=B8=B2?= =?UTF-8?q?=E0=B8=87=E0=B9=82=E0=B8=94=E0=B8=A2=E0=B9=83=E0=B8=8A=E0=B9=89?= =?UTF-8?q?=20Colab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSinger_colab_notebook.ipynb | 188 +++++++++++++++++++++++++++++--- 1 file changed, 170 insertions(+), 18 deletions(-) diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb index e1a4deaba..96d546e75 100644 --- a/DiffSinger_colab_notebook.ipynb +++ b/DiffSinger_colab_notebook.ipynb @@ -75,13 +75,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "pK8aicf8A2sj", "cellView": "form", - "collapsed": true + "collapsed": true, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 630 + }, + "outputId": "bcc59e57-72c6-4170-a950-e505e46eef54" }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "setup complete!\n", + "|\n", + "|\n", + "|\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], "source": [ "from IPython.display import clear_output, Audio, 
display, HTML\n", "import os\n", @@ -185,11 +231,11 @@ "%cd /content\n", "#@markdown this cell will create a folder name [raw_data] in the root folder of colab (/content) and extract your data into it\n", "\n", - "data_type = \"lab + wav (NNSVS format)\" # @param [\"lab + wav (NNSVS format)\", \"csv + wav (DiffSinger format)\", \"ds (DiffSinger format)\"]\n", + "data_type = \"ds (DiffSinger format)\" # @param [\"lab + wav (NNSVS format)\", \"csv + wav (DiffSinger format)\", \"ds (DiffSinger format)\"]\n", "\n", "#@markdown The path to your data zip file\n", "\n", - "data_zip_path = \"\" #@param {type:\"string\"}\n", + "data_zip_path = \"/content/drive/MyDrive/dataset/Kochujang.zip\" #@param {type:\"string\"}\n", "\n", "#@markdown ___\n", "\n", @@ -209,7 +255,7 @@ " estimate_midi = False\n", " midi_pitch_ext = None\n", "#@markdown Determine how long it will segment your data to based on silence phoneme placement (seconds)\n", - "segment_length = 15 #@param {type:\"slider\", min:5, max:35, step:1}\n", + "segment_length = 12 #@param {type:\"slider\", min:5, max:35, step:1}\n", "\n", "#@markdown Determine how many silence phoneme is allowed in the middle of each segment\n", "max_silence_phoneme_amount = 2 #@param {type:\"slider\", min:0, max:50, step:1}\n", @@ -503,10 +549,65 @@ ], "metadata": { "cellView": "form", - "id": "JsP1TGg2F1g3" + "id": "JsP1TGg2F1g3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7ebf686e-957d-43bc-ddb1-40697e7abec9" }, - "execution_count": null, - "outputs": [] + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content\n", + "\n", + "7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21\n", + "p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)\n", + "\n", + "Scanning the drive for archives:\n", + " 0M Scan 
/content/drive/MyDrive/dataset/\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b1 file, 162305254 bytes (155 MiB)\n", + "\n", + "Extracting archive: /content/drive/MyDrive/dataset/Kochujang.zip\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:300: SyntaxWarning: invalid escape sequence '\\('\n", + " m = re.match('([su]([0-9]{1,2})p?) \\(([0-9]{1,2}) bit\\)$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:301: SyntaxWarning: invalid escape sequence '\\('\n", + " m2 = re.match('([su]([0-9]{1,2})p?)( \\(default\\))?$', token)\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:310: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(flt)p?( \\(default\\))?$', token):\n", + "/usr/local/lib/python3.12/dist-packages/pydub/utils.py:314: SyntaxWarning: invalid escape sequence '\\('\n", + " elif re.match('(dbl)p?( \\(default\\))?$', token):\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--\n", + "Path = /content/drive/MyDrive/dataset/Kochujang.zip\n", + "Type = zip\n", + "Physical Size = 162305254\n", + "\n", + " 0%\b\b\b\b \b\b\b\b 8% 2 - JP03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 17% 5 - JP04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 26% 8 - TH01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 31% 8 - TH01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 40% 11 - TH02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 45% 14 - TH03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 49% 14 - TH03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 57% 17 - TH04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 
\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 59% 17 - TH04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 66% 21 - EN01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 69% 21 - EN01.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 74% 24 - EN02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 81% 27 - EN03.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 88% 30 - EN04.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b 95% 36 - JP02.wav\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\bEverything is Ok\n", + "\n", + "Files: 37\n", + "Size: 241625397\n", + "Compressed: 162305254\n", + "extraction complete!\n", + "|\n", + "|\n", + "|\n", + "I'm also nice enough to convert your data and also write your dictionaries lmao. You are welcome :)\n" + ] + } + ] }, { "cell_type": "code", @@ -522,7 +623,7 @@ "%cd /content\n", "clear_output()\n", "#@markdown The model type user is training\n", - "model_type = \"acoustic\" # @param [\"acoustic\", \"variance\"]\n", + "model_type = \"variance\" # @param [\"acoustic\", \"variance\"]\n", "config_cap = model_type.upper()\n", "diffusion_type = \"reflow\" # @param [\"ddpm\", \"reflow\"]\n", "diff_accelerator = \"unipc\" # @param [\"ddim\", \"pndm\", \"dpm-solver\", \"unipc\"]\n", @@ -609,7 +710,7 @@ "precision = \"16-mixed\" # @param [\"32-true\", \"bf16-mixed\", \"16-mixed\", \"bf16\", \"16\"]\n", "\n", "#@markdown User model save path\n", - "save_dir = \"\" #@param {type:\"string\"}\n", + "save_dir = \"/content/drive/MyDrive/dataset\" #@param {type:\"string\"}\n", "\n", "binary_save_dir = save_dir + \"/binary\"\n", "\n", @@ -619,13 +720,13 @@ "\n", "#@markdown Option to use base model for finetuning\n", "\n", - "enable_finetuning = False # @param {type:\"boolean\"}\n", + "enable_finetuning = True # @param {type:\"boolean\"}\n", "\n", "\n", "#@markdown 
Path to custom base model, leave blank to use [default](https://github.com/haru0l/diffsinger_models) models\n", "#wtf haru i just looked at your readme\"\"\"\"\"\n", "\n", - "base_model_path = \"\" # @param {type:\"string\"}\n", + "base_model_path = \"/content/DiffSinger/checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt\" # @param {type:\"string\"}\n", "\n", "if enable_finetuning:\n", " pretrain = True\n", @@ -676,7 +777,7 @@ "\n", "\n", "#@markdown due to skill issues, if user wish to train with glide embed, please enable it manually in the config\n", - "pitch_training = \"False\" # @param [\"False\", \"True | Standard\", \"True | MelodyEncoder\"]\n", + "pitch_training = \"True | Standard\" # @param [\"False\", \"True | Standard\", \"True | MelodyEncoder\"]\n", "if pitch_training == \"False\":\n", " pitch_training = False\n", " use_melody_encoder = False\n", @@ -696,7 +797,7 @@ "\n", "#@markdown Pitch extractor algorithm\n", "\n", - "f0_ext = \"parselmouth\" # @param [\"parselmouth\", \"rmvpe\", \"harvest\"]\n", + "f0_ext = \"rmvpe\" # @param [\"parselmouth\", \"rmvpe\", \"harvest\"]\n", "if f0_ext == \"rmvpe\":\n", " pe_ckpt_pth = \"checkpoints/rmvpe/model.pt\"\n", "else:\n", @@ -897,10 +998,61 @@ ], "metadata": { "cellView": "form", - "id": "nI3dzDv_Mr9Y" + "id": "nI3dzDv_Mr9Y", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "dc2c2cf6-4827-410d-81b7-0c5377e5670f" }, - "execution_count": null, - "outputs": [] + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "config updated! 
see below for config's information\n", + "|\n", + "|\n", + "|\n", + "+++---VARIANCE MULTI-SPEAKER TRAINING---+++\n", + "|\n", + "|\n", + "|\n", + "+++---user's settings---+++\n", + "\n", + "\n", + "speaker name: []\n", + "\n", + "\n", + "data augmentation: True\n", + "\n", + "\n", + "pitch extractor: rmvpe\n", + "\n", + "\n", + "binary data save directory: /content/drive/MyDrive/dataset/binary\n", + "\n", + "\n", + "your model will be saved to: /content/drive/MyDrive/dataset\n", + "\n", + "\n", + "==========================================================================================\n", + "\n", + "\n", + "+++---other auto-defined settings---+++\n", + "\n", + "\n", + "dictionary (auto generated): /content/DiffSinger/dictionaries (check this directory)\n", + "\n", + "\n", + "==========================================================================================\n", + "\n", + "\n", + "if you don't like or disagree with any of these options,\n", + "you can go and edit the config at [/content/DiffSinger/configs/variance.yaml]\n" + ] + } + ] }, { "cell_type": "code", From 9a07d7f66f1f7eecadf92ec5fac80aa5a2e701b3 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Thu, 19 Mar 2026 06:38:01 +0700 Subject: [PATCH 10/12] =?UTF-8?q?=E0=B8=AA=E0=B8=A3=E0=B9=89=E0=B8=B2?= =?UTF-8?q?=E0=B8=87=E0=B9=82=E0=B8=94=E0=B8=A2=E0=B9=83=E0=B8=8A=E0=B9=89?= =?UTF-8?q?=20Colab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSinger_colab_notebook.ipynb | 58 +++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/DiffSinger_colab_notebook.ipynb b/DiffSinger_colab_notebook.ipynb index 96d546e75..f478fbc14 100644 --- a/DiffSinger_colab_notebook.ipynb +++ b/DiffSinger_colab_notebook.ipynb @@ -1067,10 +1067,62 @@ ], "metadata": { "cellView": "form", - "id": "76NvDR1cXlDM" + "id": "76NvDR1cXlDM", + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "outputId": "8de8d8c2-cef2-4b87-a3cc-e133ac12f1c2" }, - "execution_count": null, - "outputs": [] + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/DiffSinger\n", + "| Hparams chains: ['configs/base.yaml', '/content/DiffSinger/configs/variance.yaml']\n", + "| Hparams: \n", + "\u001b[0;33mK_step\u001b[0m: 1000, \u001b[0;33maccumulate_grad_batches\u001b[0m: 1, \u001b[0;33maudio_sample_rate\u001b[0m: 44100, \u001b[0;33mbase_config\u001b[0m: ['configs/base.yaml'], \u001b[0;33mbinarization_args\u001b[0m: {'num_workers': 0, 'shuffle': True, 'prefer_ds': True}, \n", + "\u001b[0;33mbinarizer_cls\u001b[0m: preprocessing.variance_binarizer.VarianceBinarizer, \u001b[0;33mbinary_data_dir\u001b[0m: /content/drive/MyDrive/dataset/binary, \u001b[0;33mbreathiness_db_max\u001b[0m: -20.0, \u001b[0;33mbreathiness_db_min\u001b[0m: -96.0, \u001b[0;33mbreathiness_smooth_width\u001b[0m: 0.12, \n", + "\u001b[0;33mclip_grad_norm\u001b[0m: 1, \u001b[0;33mdataloader_prefetch_factor\u001b[0m: 2, \u001b[0;33mdataset_size_key\u001b[0m: lengths, \u001b[0;33mdatasets\u001b[0m: [], \u001b[0;33mdictionaries\u001b[0m: {'custom': '/content/DiffSinger/dictionaries/dictionary-custom.txt'}, \n", + "\u001b[0;33mdiff_accelerator\u001b[0m: unipc, \u001b[0;33mdiff_speedup\u001b[0m: 10, \u001b[0;33mdiffusion_type\u001b[0m: reflow, \u001b[0;33mdropout\u001b[0m: 0.1, \u001b[0;33mds_workers\u001b[0m: 4, \n", + "\u001b[0;33mdur_prediction_args\u001b[0m: {'arch': 'fs2', 'dropout': 0.1, 'hidden_size': 512, 'kernel_size': 3, 'lambda_pdur_loss': 0.3, 'lambda_sdur_loss': 3.0, 'lambda_wdur_loss': 1.0, 'log_offset': 1.0, 'loss_type': 'mse', 'num_layers': 5}, \u001b[0;33menc_ffn_kernel_size\u001b[0m: 3, \u001b[0;33menc_layers\u001b[0m: 4, \u001b[0;33menergy_db_max\u001b[0m: -12.0, \u001b[0;33menergy_db_min\u001b[0m: -96.0, \n", + "\u001b[0;33menergy_smooth_width\u001b[0m: 0.12, \u001b[0;33mexp_name\u001b[0m: , 
\u001b[0;33mextra_phonemes\u001b[0m: [], \u001b[0;33mf0_max\u001b[0m: 1100, \u001b[0;33mf0_min\u001b[0m: 65, \n", + "\u001b[0;33mffn_act\u001b[0m: gelu, \u001b[0;33mfft_size\u001b[0m: 2048, \u001b[0;33mfinetune_ckpt_path\u001b[0m: /content/DiffSinger/checkpoints/nsf_hifigan_44.1k_hop512_128bin_2024.02/model.ckpt, \u001b[0;33mfinetune_enabled\u001b[0m: True, \u001b[0;33mfinetune_ignored_params\u001b[0m: ['model.spk_embed', 'model.fs2.txt_embed', 'model.fs2.encoder.embed_tokens'], \n", + "\u001b[0;33mfinetune_strict_shapes\u001b[0m: False, \u001b[0;33mfreezing_enabled\u001b[0m: False, \u001b[0;33mfrozen_params\u001b[0m: [], \u001b[0;33mglide_embed_scale\u001b[0m: 11.31370849898476, \u001b[0;33mglide_types\u001b[0m: ['up', 'down'], \n", + "\u001b[0;33mhidden_size\u001b[0m: 256, \u001b[0;33mhnsep\u001b[0m: vr, \u001b[0;33mhnsep_ckpt\u001b[0m: checkpoints/vr/model.pt, \u001b[0;33mhop_size\u001b[0m: 512, \u001b[0;33minfer\u001b[0m: False, \n", + "\u001b[0;33mlambda_dur_loss\u001b[0m: 1.0, \u001b[0;33mlambda_pitch_loss\u001b[0m: 1.0, \u001b[0;33mlambda_var_loss\u001b[0m: 1.0, \u001b[0;33mlog_interval\u001b[0m: 100, \u001b[0;33mlr_scheduler_args\u001b[0m: {'gamma': 0.75, 'scheduler_cls': 'torch.optim.lr_scheduler.StepLR', 'step_size': 10000}, \n", + "\u001b[0;33mmain_loss_log_norm\u001b[0m: True, \u001b[0;33mmain_loss_type\u001b[0m: l2, \u001b[0;33mmax_batch_frames\u001b[0m: 80000, \u001b[0;33mmax_batch_size\u001b[0m: 48, \u001b[0;33mmax_beta\u001b[0m: 0.02, \n", + "\u001b[0;33mmax_updates\u001b[0m: 160000, \u001b[0;33mmax_val_batch_frames\u001b[0m: 60000, \u001b[0;33mmax_val_batch_size\u001b[0m: 1, \u001b[0;33mmelody_encoder_args\u001b[0m: {'enc_layers': 4, 'hidden_size': 128}, \u001b[0;33mmerged_phoneme_groups\u001b[0m: [], \n", + "\u001b[0;33mmidi_smooth_width\u001b[0m: 0.06, \u001b[0;33mnccl_p2p\u001b[0m: True, \u001b[0;33mnum_ckpt_keep\u001b[0m: 5, \u001b[0;33mnum_heads\u001b[0m: 2, \u001b[0;33mnum_lang\u001b[0m: 1, \n", + "\u001b[0;33mnum_sanity_val_steps\u001b[0m: 
1, \u001b[0;33mnum_spk\u001b[0m: 0, \u001b[0;33mnum_valid_plots\u001b[0m: 10, \u001b[0;33moptimizer_args\u001b[0m: {'beta1': 0.9, 'beta2': 0.98, 'lr': 0.0006, 'optimizer_cls': 'torch.optim.AdamW', 'weight_decay': 0}, \u001b[0;33mpe\u001b[0m: rmvpe, \n", + "\u001b[0;33mpe_ckpt\u001b[0m: checkpoints/rmvpe/model.pt, \u001b[0;33mpermanent_ckpt_interval\u001b[0m: 10000, \u001b[0;33mpermanent_ckpt_start\u001b[0m: 80000, \u001b[0;33mpitch_prediction_args\u001b[0m: {'backbone_args': {'dilation_cycle_length': 5, 'num_channels': 512, 'num_layers': 6}, 'backbone_type': 'lynxnet', 'pitd_clip_max': 12.0, 'pitd_clip_min': -12.0, 'pitd_norm_max': 8.0, 'pitd_norm_min': -8.0, 'repeat_bins': 64}, \u001b[0;33mpl_trainer_accelerator\u001b[0m: auto, \n", + "\u001b[0;33mpl_trainer_devices\u001b[0m: auto, \u001b[0;33mpl_trainer_num_nodes\u001b[0m: 1, \u001b[0;33mpl_trainer_precision\u001b[0m: 16-mixed, \u001b[0;33mpl_trainer_strategy\u001b[0m: {'find_unused_parameters': False, 'name': 'auto', 'process_group_backend': 'nccl'}, \u001b[0;33mpredict_breathiness\u001b[0m: False, \n", + "\u001b[0;33mpredict_dur\u001b[0m: True, \u001b[0;33mpredict_energy\u001b[0m: False, \u001b[0;33mpredict_pitch\u001b[0m: True, \u001b[0;33mpredict_tension\u001b[0m: True, \u001b[0;33mpredict_voicing\u001b[0m: True, \n", + "\u001b[0;33mrel_pos\u001b[0m: True, \u001b[0;33mrope_interleaved\u001b[0m: False, \u001b[0;33msampler_frame_count_grid\u001b[0m: 6, \u001b[0;33msampling_algorithm\u001b[0m: euler, \u001b[0;33msampling_steps\u001b[0m: 20, \n", + "\u001b[0;33mschedule_type\u001b[0m: linear, \u001b[0;33msort_by_len\u001b[0m: True, \u001b[0;33mtask_cls\u001b[0m: training.variance_task.VarianceTask, \u001b[0;33mtension_logit_max\u001b[0m: 10.0, \u001b[0;33mtension_logit_min\u001b[0m: -10.0, \n", + "\u001b[0;33mtension_smooth_width\u001b[0m: 0.06, \u001b[0;33mtime_scale_factor\u001b[0m: 1000, \u001b[0;33mtimesteps\u001b[0m: 1000, \u001b[0;33muse_glide_embed\u001b[0m: False, \u001b[0;33muse_lang_id\u001b[0m: False, 
\n", + "\u001b[0;33muse_melody_encoder\u001b[0m: False, \u001b[0;33muse_pos_embed\u001b[0m: True, \u001b[0;33muse_rope\u001b[0m: True, \u001b[0;33muse_spk_id\u001b[0m: True, \u001b[0;33mval_check_interval\u001b[0m: 2000, \n", + "\u001b[0;33mvariances_prediction_args\u001b[0m: {'backbone_args': {'dilation_cycle_length': 4, 'num_channels': 384, 'num_layers': 6}, 'backbone_type': 'lynxnet', 'total_repeat_bins': 48}, \u001b[0;33mvoicing_db_max\u001b[0m: -12.0, \u001b[0;33mvoicing_db_min\u001b[0m: -96.0, \u001b[0;33mvoicing_smooth_width\u001b[0m: 0.12, \u001b[0;33mwin_size\u001b[0m: 2048, \n", + "\u001b[0;33mwork_dir\u001b[0m: /content/drive/MyDrive/dataset, \n", + "| Binarizer: \n", + "Traceback (most recent call last):\n", + " File \"/content/DiffSinger/scripts/binarize.py\", line 25, in \n", + " binarize()\n", + " File \"/content/DiffSinger/scripts/binarize.py\", line 21, in binarize\n", + " binarizer_cls().process()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/content/DiffSinger/preprocessing/variance_binarizer.py\", line 67, in __init__\n", + " super().__init__(data_attrs=VARIANCE_ITEM_ATTRIBUTES)\n", + " File \"/content/DiffSinger/basics/base_binarizer.py\", line 60, in __init__\n", + " self.build_spk_map()\n", + " File \"/content/DiffSinger/basics/base_binarizer.py\", line 85, in build_spk_map\n", + " assert max(spk_ids) < hparams['num_spk'], \\\n", + " ^^^^^^^^^^^^\n", + "ValueError: max() iterable argument is empty\n" + ] + } + ] }, { "cell_type": "markdown", From aebc7c62644dd869325a313c8cc7c4cd9ed4fee5 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Thu, 19 Mar 2026 10:14:41 +0700 Subject: [PATCH 11/12] Refactor Thai VCCV Phonemizer class structure Update more Signed-off-by: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> --- .../Thai_VCCV_2025_Phonemizer.cs | 219 +++++++++++++----- 1 file changed, 162 insertions(+), 57 deletions(-) diff --git 
a/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs index 853873160..4066983fd 100644 --- a/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs +++ b/OpenUtau.Plugin.Builtin/Thai_VCCV_2025_Phonemizer.cs @@ -1,7 +1,8 @@ -using System; +using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; +using System.Text.RegularExpressions; using Melanchall.DryWetMidi.Interaction; using OpenUtau.Api; using OpenUtau.Classic; @@ -9,26 +10,59 @@ using Serilog; namespace OpenUtau.Plugin.Builtin { - [Phonemizer("Thai2 CVVC&VCCV Phonemizer", "TH2 CVVC&VCCV", " Phonemizer by Ferina, PRINTmov and DELTA SYNTH ", language: "TH")] - public class Thai2 CVVC&VCCV Phonemizer : TH2 Phonemizer { - static readonly string[] vowels = new string[] { + [Phonemizer("Thai VCCV Phonemizer", "TH VCCV", "PRINTmov", language: "TH")] + public class ThaiVCCVPhonemizer : Phonemizer { + + readonly string[] vowels = new string[] { "a", "i", "u", "e", "o", "@", "Q", "3", "6", "1", "ia", "ua", "I", "8" }; - static readonly string[] diphthongs = new string[] { - "r", "l","w", "y" + readonly string[] diphthongs = new string[] { + "r", "l", "w" + }; + + readonly string[] consonants = new string[] { + "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y" + }; + + readonly string[] endingConsonants = new string[] { + "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y" + }; + + private readonly Dictionary VowelMapping = new Dictionary { + {"เcือะ", "6"}, {"เcือx", "6"}, {"แcะ", "@"}, {"แcx", "@"}, {"เcอะ", "3"}, {"เcอ", "3"}, {"ไc", "I"}, {"ใc", "I"}, {"เcาะ", "Q"}, {"cอx", "Q"}, + {"cืx", "1"}, {"cึx", "1"}, {"cือ", "1"}, {"cะ", "a"}, {"cัx", "a"}, {"cาx", "a"}, {"เcา", "8"}, {"เcะ", "e"}, {"เcx", "e"}, {"cิx", "i"}, {"cีx", "i"}, + {"เcียะ", "ia"}, {"เcียx", "ia"}, {"โcะ", "o"}, {"โcx", "o"}, {"cุx", "u"}, {"cูx", "u"}, 
{"cัวะ", "ua"}, {"cัว", "ua"}, {"cำ", "am"}, {"เcิx", "3"}, {"เcิ", "3"} }; - static readonly string[] consonants = new string[] { - "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y", "-" + private readonly Dictionary CMapping = new Dictionary { + {'-', ""}, {'อ', ""}, // เพิ่ม - และ อ ให้แมปเป็นค่าว่าง เพื่อลากเข้าสระได้เลย + {'ก', "k"}, {'ข', "kh"}, {'ค', "kh"}, {'ฆ', "kh"}, {'ฅ', "kh"}, {'ฃ', "kh"}, + {'จ', "j"}, {'ฉ', "ch"}, {'ช', "ch"}, {'ฌ', "ch"}, + {'ฎ', "d"}, {'ด', "d"}, + {'ต', "t"}, {'ฏ', "t"}, + {'ถ', "th"}, {'ฐ', "th"}, {'ฑ', "th"}, {'ธ', "th"}, {'ท', "th"}, + {'บ', "b"}, {'ป', "p"}, {'พ', "ph"}, {'ผ', "ph"}, {'ภ', "ph"}, {'ฟ', "f"}, {'ฝ', "f"}, + {'ห', "h"}, {'ฮ', "h"}, + {'ม', "m"}, {'น', "n"}, {'ณ', "n"}, {'ร', "r"}, {'ล', "l"}, {'ฤ', "r"}, + {'ส', "s"}, {'ศ', "s"}, {'ษ', "s"}, {'ซ', "s"}, + {'ง', "g"}, {'ย', "y"}, {'ญ', "y"}, {'ว', "w"}, {'ฬ', "r"} }; - static readonly string[] endingConsonants = new string[] { - "b", "ch", "d", "f", "g", "h", "j", "k", "kh", "l", "m", "n", "p", "ph", "r", "s", "t", "th", "w", "y", "-" - }; + private readonly Dictionary XMapping = new Dictionary { + {'บ', "b"}, {'ป', "b"}, {'พ', "b"}, {'ฟ', "b"}, {'ภ', "b"}, + {'ด', "d"}, {'จ', "d"}, {'ช', "d"}, {'ซ', "d"}, {'ฎ', "d"}, {'ฏ', "d"}, {'ฐ', "d"}, + {'ฑ', "d"}, {'ฒ', "d"}, {'ต', "d"}, {'ถ', "d"}, {'ท', "d"}, {'ธ', "d"}, {'ศ', "d"}, {'ษ', "d"}, {'ส', "d"}, + {'ก', "k"}, {'ข', "k"}, {'ค', "k"}, {'ฆ', "k"}, + {'ว', "w"}, + {'ย', "y"}, + {'น', "n"}, {'ญ', "n"}, {'ณ', "n"}, {'ร', "n"}, {'ล', "n"}, {'ฬ', "n"}, + {'ง', "g"}, + {'ม', "m"} + }; private USinger singer; - public override void SetSinger(Your's Singer Name) => this.singer = singer; + public override void SetSinger(USinger singer) => this.singer = singer; private bool checkOtoUntilHit(string[] input, Note note, out UOto oto) { oto = default; @@ -51,7 +85,6 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? 
prevN } var phonemes = new List(); - List tests = new List(); string prevTemp = ""; @@ -59,18 +92,17 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevN prevTemp = prevNeighbour.Value.lyric; } var prevTh = ParseInput(prevTemp); - var noteTh = ParseInput(currentLyric); if (noteTh.Consonant != null && noteTh.Dipthong == null && noteTh.Vowel != null) { - if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong + noteTh.Vowel }, note, out var tempOto)) { + if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Vowel }, note, out var tempOto)) { tests.Add(tempOto.Alias); } - } else if (noteTh.Consonant != null && noteTh.Dipthong != null && noteTh.Vowel != null && noteTh.endingConsonants != null ) { - if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong + noteTh.Vowel + noteTh.endingConsonants}, note, out var tempOto)) { + } else if (noteTh.Consonant != null && noteTh.Dipthong != null && noteTh.Vowel != null) { + if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong + noteTh.Vowel }, note, out var tempOto)) { tests.Add(tempOto.Alias); } else { - if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong }, note, out var tempOto)) { + if (checkOtoUntilHit(new string[] { noteTh.Consonant + noteTh.Dipthong }, note, out tempOto)) { tests.Add(tempOto.Alias); } if (checkOtoUntilHit(new string[] { noteTh.Dipthong + noteTh.Vowel }, note, out tempOto)) { @@ -95,7 +127,7 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevN } } else if (nextNeighbour != null && noteTh.Vowel != null) { var nextTh = ParseInput(nextNeighbour.Value.lyric); - if (checkOtoUntilHit(new string[] noteTh.Consonant { " " + nextTh.Vowel }, note, out var tempOto)) { + if (checkOtoUntilHit(new string[] { noteTh.Vowel + " " + nextTh.Consonant }, note, out var tempOto)) { tests.Add(tempOto.Alias); } } @@ -125,55 +157,46 @@ public override Result Process(Note[] notes, Note? prev, Note? 
next, Note? prevN } if (checkOtoUntilHit(tests.ToArray(), note, out var oto)) { - var noteDuration = notes.Sum(n => n.duration); - for (int i = 0; i < tests.ToArray().Length; i++) { - int position = 0; - int vcPosition = noteDuration - 50; + int vcPosition = noteDuration - 120; - if (nextNeighbour != null && tests[i].Contains(" ")) - { + if (nextNeighbour != null && tests[i].Contains(" ")) { var nextLyric = nextNeighbour.Value.lyric.Normalize(); if (!string.IsNullOrEmpty(nextNeighbour.Value.phoneticHint)) { nextLyric = nextNeighbour.Value.phoneticHint.Normalize(); } var nextTh = ParseInput(nextLyric); var nextCheck = nextTh.Vowel; - if (nextTh.Consonant != null) { - nextCheck = nextTh.Consonant + nextTh.Vowel; - } if (nextTh.Consonant != null) { - nextCheck = nextTh.Consonant + nextTh.Dipthong + nextTh.Vowel; + nextCheck = nextTh.Consonant + nextTh.Vowel; } - if(nextTh.Dipthong != null) { + if (nextTh.Dipthong != null) { nextCheck = nextTh.Consonant + nextTh.Dipthong + nextTh.Vowel; } - var nextAttr = nextNeighbour.Value.phonemeAttributes?.FirstOrDefault(attr => attr.index == 0.9) ?? default; + var nextAttr = nextNeighbour.Value.phonemeAttributes?.FirstOrDefault(attr => attr.index == 0) ?? 
default; if (singer.TryGetMappedOto(nextCheck, nextNeighbour.Value.tone + nextAttr.toneShift, nextAttr.voiceColor, out var nextOto)) { - if (oto.Overlap > 30) { + if (oto.Overlap > 0) { vcPosition = noteDuration - MsToTick(nextOto.Overlap) - MsToTick(nextOto.Preutter); } } } - - if (noteTh.Dipthong == null || tests.Count <= 1) { + if (noteTh.Dipthong == null || tests.Count <= 2) { if (i == 1) { - position = Math.Max((int)(noteDuration * 0.25), vcPosition); + position = Math.Max((int)(noteDuration * 0.75), vcPosition); } } else { if (i == 1) { - position = Math.Min((int)(noteDuration * 0.15), 60); + position = Math.Min((int)(noteDuration * 0.1), 60); } else if (i == 2) { - position = Math.Max((int)(noteDuration * 0.30), vcPosition); + position = Math.Max((int)(noteDuration * 0.75), vcPosition); } } phonemes.Add(new Phoneme { phoneme = tests[i], position = position }); } - } return new Result { @@ -182,8 +205,10 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? prevN } (string Consonant, string Dipthong, string Vowel, string EndingConsonant) ParseInput(string input) { + input = WordToPhonemes(input); + string consonant = null; - string dipthong = null; + string diphthong = null; string vowel = null; string endingConsonant = null; @@ -191,43 +216,123 @@ public override Result Process(Note[] notes, Note? prev, Note? next, Note? 
prevN return (null, null, null, null); } - if (input.Length > 3) { - foreach (var dip in diphthongs) { - if (input[1].ToString() || input[1].Equals(dip) || input[1].ToString().Equals(dip)) { - dipthong = dip; + foreach (var con in consonants) { + if (input.StartsWith(con)) { + if (consonant == null || consonant.Length < con.Length) { + consonant = con; } } } - if else (input.Length > 4) { - foreach (var dip in diphthongs + endingConsonant) { - if (input[1].ToString() || input[1].Equals(dip) || input[1].ToString() || input[1].ToString() Equals(EC)) { - dipthong = dip endingConsonant = EC; + + int startIdx = consonant?.Length ?? 0; + foreach (var dip in diphthongs) { + if (input.Substring(startIdx).StartsWith(dip)) { + if (diphthong == null || diphthong.Length < dip.Length) { + diphthong = dip; } } } - foreach (var con in consonants) { - if (input.StartsWith(con)) { - if (consonant == null || consonant.Length + con.Length) { - consonant = con; + startIdx += diphthong?.Length ?? 0; + foreach (var vow in vowels) { + if (input.Substring(startIdx).StartsWith(vow)) { + if (vowel == null || vowel.Length < vow.Length) { + vowel = vow; } } + } + + foreach (var con in endingConsonants) { if (input.EndsWith(con)) { - if (endingConsonant == null || endingConsonant.Length + con.Length) { + if (endingConsonant == null || endingConsonant.Length < con.Length) { endingConsonant = con; } } } - foreach (var vow in vowels) { - if (input.Contains(vow)) { - if (vowel == null || vowel.Length = vow.Length) { - vowel = vow; + return (consonant, diphthong, vowel, endingConsonant); + } + + public string WordToPhonemes(string input) { + input = input.Replace(" ", ""); + input = RemoveInvalidLetters(input); + + // เพิ่มการตรวจจับ '-' เพื่อไม่ให้โค้ดข้ามการประมวลผลไป + if (!Regex.IsMatch(input, "[ก-ฮ-]")) { + return input; + } + + foreach (var mapping in VowelMapping) { + // อัปเดต Regex บล็อกพยัญชนะต้น (c) ให้จับเฉพาะคำควบกล้ำที่มีอยู่จริง เพื่อกันปัญหาดึงตัวสะกดไปควบกล้ำ (เช่น แล้ว เป็น แว้) + 
string pattern = "^" + mapping.Key + .Replace("c", "(ก[รลว]|ข[รลว]|ค[รลว]|ต[รลว]|ป[รล]|พ[รลว]|ฟ[รล]|บ[รล]|ด[ร]|ผล|ทร|ศร|สร|ห[ก-ฮ]|อย|[ก-ฮ-])") + .Replace("x", "([ก-ฮ]?)") + "$"; + + var match = Regex.Match(input, pattern); + if (match.Success) { + string c = match.Groups[1].Value; + string x = match.Groups.Count > 2 ? match.Groups[2].Value : string.Empty; + if (c.Length >= 2 && (c.StartsWith("ห") || c.StartsWith("อ"))) { + c = c.Substring(1); + } + string cConverted = ConvertC(c); + string xConverted = ConvertX(x); + + if (mapping.Value == "a" && input.Contains("ั") && x == "ว") { + return cConverted + "ua"; + } + if (mapping.Value == "e" && x == "ย") { + return cConverted + "3" + xConverted; } + return cConverted + mapping.Value + xConverted; } } + if (input.Length == 1) { + return ConvertC(input) + "Q"; + } else if (input.Length == 2) { + return ConvertC(input[0].ToString()) + "o" + ConvertX(input[1].ToString()); + } else if (input.Length == 3) { + if (input[1] == 'ว') { + return ConvertC(input[0].ToString()) + "ua" + ConvertX(input[2].ToString()); + } else { + return ConvertC(input.Substring(0, 2).ToString()) + "o" + ConvertX(input[1].ToString()); + } + } else if (input.Length == 4) { + if (input[2] == 'ว') { + return ConvertC(input.Substring(0, 2).ToString()) + "ua" + ConvertX(input[3].ToString()); + } + } + return input; + } - return (consonant, dipthong, vowel, endingConsonant); + private string ConvertC(string input) { + if (string.IsNullOrEmpty(input)) return input; + char firstChar = input[0]; + char? secondChar = input.Length > 1 ? 
input[1] : (char?)null; + if (CMapping.ContainsKey(firstChar)) { + string firstCharConverted = CMapping[firstChar]; + if (secondChar != null && CMapping.ContainsKey((char)secondChar)) { + return firstCharConverted + CMapping[(char)secondChar]; + } + return firstCharConverted; + } + return input; } + + private string ConvertX(string input) { + if (string.IsNullOrEmpty(input)) return input; + char firstChar = input[0]; + if (XMapping.ContainsKey(firstChar)) { + return XMapping[firstChar]; + } + return input; + } + + private string RemoveInvalidLetters(string input) { + input = Regex.Replace(input, ".์", ""); + input = Regex.Replace(input, "[่้๊๋็]", ""); + return input; + } + } } From 7c57f0e5bc9b0e8d6c99eea40f6051486deafc62 Mon Sep 17 00:00:00 2001 From: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> Date: Thu, 19 Mar 2026 13:40:31 +0700 Subject: [PATCH 12/12] Delete OpenUtau/Colors/Red Theme.axml It's not ready, I think I delete now. Signed-off-by: DELTA SYNTH <105579737+deltaVOCALOID09378@users.noreply.github.com> --- OpenUtau/Colors/Red Theme.axml | 48 ---------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 OpenUtau/Colors/Red Theme.axml diff --git a/OpenUtau/Colors/Red Theme.axml b/OpenUtau/Colors/Red Theme.axml deleted file mode 100644 index 44985b3f1..000000000 --- a/OpenUtau/Colors/Red Theme.axml +++ /dev/null @@ -1,48 +0,0 @@ - - true - - #303030 - #505050 - #707070 - #404040 - - #E0E0E0 - #FCFCFC - #FFFFFF - #A0A0A0 - - #707070 - #B0B0B0 - - #4EA6EA - - #90CAF9 - - #1E88E5 - - #808080 - #A0A0A0 - #4EA6EA - #FF679D - #E62E6E - - #707070 - #D0D0D0 - #D0D0D0 - #404040 - - - #CC2A63 - #FF347C - #FFFFFF - - #CCA5B0 - #FFCEDC - #FF347C - - Transparent - Transparent - #FFFFFF -