From b38766c817be44f76185a6d17bf4c38de3a748e4 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Sun, 11 Jan 2026 11:05:56 +0900 Subject: [PATCH 1/4] Enhanced Slur Expression and Added F0 Correction Processing in VOICEVOX We have enhanced slur (vowel extension note) support and added F0 (pitch) correction processing in the VOICEVOX renderer. We reorganized the logic in `PhraseToVoicevoxSynthParams`, revised the arguments and exception handling in `BuildVNotes`, added slur length addition processing in `NoteGroupsToVQuery`, newly implemented the `AdjustF0ForSlur` method, and strengthened the robustness of `getBaseSingerID`. This results in more natural and stable slur expression in OpenUtau. --- OpenUtau.Core/Voicevox/VoicevoxRenderer.cs | 182 ++++++++++++--------- OpenUtau.Core/Voicevox/VoicevoxUtils.cs | 143 ++++++++++++++-- 2 files changed, 230 insertions(+), 95 deletions(-) diff --git a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs index 3f78878f7..e6549b6b6 100644 --- a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs +++ b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs @@ -4,6 +4,7 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; +using K4os.Hash.xxHash; using NAudio.Wave; using Newtonsoft.Json; using Newtonsoft.Json.Linq; @@ -11,6 +12,7 @@ using OpenUtau.Core.Render; using OpenUtau.Core.Ustx; using Serilog; +using SharpCompress; using ThirdParty; /* @@ -65,6 +67,7 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra } string progressInfo = $"Track {trackNo + 1}: {this} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\""; progress.Complete(0, progressInfo); + //ulong hash = HashPhraseGroups(phrase); var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}.wav"); phrase.AddCacheFile(wavPath); var result = Layout(phrase); @@ -75,7 +78,8 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra VoicevoxUtils.Loaddic(singer); } try { - 
VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger); + Log.Information($"Starting Voicevox synthesis"); + VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger, false); int vvTotalFrames = 0; double frameMs = (1000d / VoicevoxUtils.fps); @@ -106,18 +110,21 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra result.positionMs = phrase.positionMs - phrase.timeAxis.TickPosToMsPos((vsParams.phonemes.First().frame_length / VoicevoxUtils.fps) * 1000d); } - int speaker = 0; + int speakerID = 0; singer.voicevoxConfig.styles.ForEach(style => { if (style.name.Equals(phrase.singer.Subbanks[1].Suffix) && style.type.Equals("frame_decode")) { - speaker = style.id; + speakerID = style.id; } + // Apply the voice color setting value if (style.name.Equals(phrase.phones[0].suffix) && style.type.Equals("frame_decode")) { - speaker = style.id; - } else if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) { - speaker = style.id; + speakerID = style.id; + } else // Supports styles with the same name but different types + if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) { + speakerID = style.id; } }); - var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary { { "speaker", speaker.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" }; + VoicevoxUtils.InitializedSpeaker(speakerID.ToString(), false); + var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary { { "speaker", speakerID.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" }; var response = VoicevoxClient.Inst.SendRequest(queryurl); byte[] bytes = null; if (!response.Item2.Equals(null)) { @@ -132,7 +139,12 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra File.WriteAllBytes(wavPath, bytes); } } catch (Exception e) { 
- Log.Error(e, "Failed to create a voice base."); + if (e is VoicevoxException) { + Log.Error($"Failed to create the audio."); + throw new MessageCustomizableException("Failed to create the audio.", "Failed to create the audio.", e); + } else { + Log.Error($"Failed to create the audio.{e}"); + } } if (cancellation.IsCancellationRequested) { return new RenderResult(); @@ -155,78 +167,25 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra return task; } - private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer) { + private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer, bool pitch_slur) { - VoicevoxSynthParams vsParams = new VoicevoxSynthParams(); //Prepare for future additions of Teacher Singer. string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); + VoicevoxUtils.InitializedSpeaker(baseSingerID, true); + List vnotes_1 = BuildVNotes(phrase, baseSingerID); - if (phrase.phones.All(p => VoicevoxUtils.IsDicKana(p.phoneme) || VoicevoxUtils.IsDicPau(p.phoneme))) { - // TODO: slur support - List vnotes = new List(); - //if (slur) { - for (int i = 0; i < phrase.phones.Length; i++) { - vnotes.Add(new VoicevoxNote() { - lyric = phrase.phones[i].phoneme, - positionMs = phrase.phones[i].positionMs, - durationMs = phrase.phones[i].durationMs, - tone = (int)(phrase.phones[i].tone + phrase.phones[i].toneShift) - }); - } - //} else { - //} - - VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis); - - vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); - } else if (phrase.phones.All(p => VoicevoxUtils.IsVowel(p.phoneme) || VoicevoxUtils.IsConsonant(p.phoneme))) { - List vnotes = new List(); - for (int i = 0; i < phrase.notes.Length; i++) { - var durationMs = phrase.notes[i].durationMs; - var currentLyric = phrase.notes[i].lyric.Normalize(); - var lyricList = currentLyric.Split(" "); - var shiftTone = 
phrase.phones[0].toneShift; - if (lyricList.Length > 1) { - currentLyric = lyricList[1]; - } - if (!VoicevoxUtils.IsSyllableVowelExtensionNote(currentLyric)) { - if (VoicevoxUtils.IsDicPau(currentLyric)) { - currentLyric = string.Empty; - } else if (VoicevoxUtils.dic.IsDic(currentLyric)) { - currentLyric = VoicevoxUtils.dic.Lyrictodic(currentLyric); - } else if (!VoicevoxUtils.IsDicKana(currentLyric)) { - currentLyric = string.Empty; - } - } else if (vnotes.Count >= i - 1 && 0 <= i - 1) { - // TODO: slur support - //if (slur) { - var tempNote = vnotes[vnotes.Count - 1]; - tempNote.durationMs += durationMs; - vnotes[vnotes.Count - 1] = tempNote; - continue; - //} else { - // if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vnotes[i - 1].lyric, out string str)) { - // currentLyric = str; - // } - //} - } - vnotes.Add(new VoicevoxNote() { - lyric = currentLyric, - positionMs = phrase.notes[i].positionMs, - durationMs = durationMs, - tone = (int)(phrase.notes[i].tone + shiftTone) - }); - } - //Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume - //Create parameters for the update source. - VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis); - VoicevoxSynthParams vsParams_1 = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); + //Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume + //Create parameters for the update source. + VoicevoxQueryMain vqMain_1 = VoicevoxUtils.NoteGroupsToVQuery(vnotes_1.ToArray(), phrase.timeAxis); + VoicevoxSynthParams vsParams = new VoicevoxSynthParams(); + if (IsPhonemeNoteCountMatch(phrase)) { + vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain_1, baseSingerID); + } else { + VoicevoxSynthParams vsParams_1 = VoicevoxUtils.VoicevoxVoiceBase(vqMain_1, baseSingerID); //Create parameters for the update destination. 
vsParams = PhonemeToVoicevoxSynthParams(phrase); VoicevoxSynthParams vsParams_2 = vsParams.Clone(); - - if (vsParams.phonemes.Count == vsParams_1.phonemes.Count) { for (int i = 0; i < vsParams_1.phonemes.Count; i++) { // TODO: Develop a VOICEVOX engine dedicated to OpenUtau so that synthesis parameters are updated when phonemes are changed. @@ -239,8 +198,8 @@ private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, Voi } } //Update F0 and volume - vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain, vsParams, baseSingerID); - vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain, vsParams, baseSingerID); + vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain_1, vsParams, baseSingerID); + vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain_1, vsParams, baseSingerID); //Update phoneme for (int i = 0; i < vsParams_2.phonemes.Count; i++) { //var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM); @@ -250,12 +209,63 @@ private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, Voi // } //} } - } else { + } + if (pitch_slur) { + VoicevoxUtils.AdjustF0ForSlur(VoicevoxUtils.NoteGroupsToVQuery(vnotes_1.ToArray(), phrase.timeAxis, true), vsParams.f0); + } + return vsParams; + } + + private List BuildVNotes(RenderPhrase phrase, string baseSingerID) { + List vnotes = new List(); + try { + //if (IsPhonemeNoteCountMatch(phrase)) { + // for (int i = 0; i < phrase.phones.Length; i++) { + // vnotes.Add(new VoicevoxNote() { + // lyric = phrase.phones[i].phoneme, + // positionMs = phrase.phones[i].positionMs, + // durationMs = phrase.phones[i].durationMs, + // tone = (int)(phrase.phones[i].tone + phrase.phones[i].toneShift) + // }); + // } + //} else { + for (int i = 0; i < phrase.notes.Length; i++) { + var durationMs = phrase.notes[i].durationMs; + var currentLyric = phrase.notes[i].lyric.Normalize(); + var lyricList = currentLyric.Split(" "); + var shiftTone = phrase.phones[0].toneShift; + foreach (string lyric in lyricList) { + 
if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyric)) { + if (VoicevoxUtils.IsPau(lyric)) { + currentLyric = string.Empty; + } else if (VoicevoxUtils.dic.IsDic(lyric)) { + currentLyric = VoicevoxUtils.dic.Lyrictodic(lyric); + } else if (VoicevoxUtils.phoneme_List.kanas.ContainsKey(lyric)) { + currentLyric = lyric; + } else { + currentLyric = string.Empty; + } + } + } + vnotes.Add(new VoicevoxNote() { + lyric = currentLyric, + positionMs = phrase.notes[i].positionMs, + durationMs = durationMs, + tone = (int)(phrase.notes[i].tone + shiftTone) + }); + } + //} + } catch (Exception e) { throw new MessageCustomizableException( $"Failed to create a voice base. The phoneme is not supported by the VOICEVOX engine.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", $"You are confusing phonemes and hiragana.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", new VoicevoxException()); } - return vsParams; + + return vnotes; + } + + private bool IsPhonemeNoteCountMatch(RenderPhrase phrase) { + return phrase.phones.Length == phrase.notes.Where(note => !VoicevoxUtils.IsSyllableVowelExtensionNote(note.lyric)).Count() && phrase.phones.All(p => VoicevoxUtils.phoneme_List.kanas.ContainsKey(p.phoneme)); } private VoicevoxSynthParams PhonemeToVoicevoxSynthParams(RenderPhrase phrase) { @@ -407,7 +417,7 @@ RenderPitchResult IRenderer.LoadRenderedPitch(RenderPhrase phrase) { if (singer != null) { string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); - VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger/*, true*/); + VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger, true); double frameMs = (1000d / VoicevoxUtils.fps); int vvTotalFrames = 0; vsParams.phonemes.ForEach(x => vvTotalFrames += x.frame_length); @@ -441,5 +451,23 @@ RenderPitchResult IRenderer.LoadRenderedPitch(RenderPhrase phrase) { } return null; } + + ulong HashPhraseGroups(RenderPhrase phrase) { + using 
(var stream = new MemoryStream()) { + using (var writer = new BinaryWriter(stream)) { + writer.Write(phrase.preEffectHash); + writer.Write(phrase.phones[0].tone); + writer.Write(phrase.phones[0].direct); + if (phrase.phones[0].direct) { + writer.Write(phrase.phones[0].toneShift); + } else { + phrase.phones.ForEach(x => writer.Write(x.toneShift)); + } + writer.Write(phrase.phones[0].volume); + return XXH64.DigestOf(stream.ToArray()); + } + } + } + } } diff --git a/OpenUtau.Core/Voicevox/VoicevoxUtils.cs b/OpenUtau.Core/Voicevox/VoicevoxUtils.cs index bcd57443e..e3abf4b21 100644 --- a/OpenUtau.Core/Voicevox/VoicevoxUtils.cs +++ b/OpenUtau.Core/Voicevox/VoicevoxUtils.cs @@ -62,8 +62,13 @@ public class VoicevoxQueryNotes { public int? key; public int frame_length; public string lyric; + // Index for Phonemizer public int vqnindex; - + // Index for Slur + public int slur_index; + public override string ToString() { + return $"vqnindex:{vqnindex}: lyric:{lyric}, key:{key}, frame_length:{frame_length}"; + } } public class VoicevoxQueryMain { @@ -180,13 +185,52 @@ public static class VoicevoxUtils { public static Dictionary_list dic = new Dictionary_list(); public static Phoneme_list phoneme_List = new Phoneme_list(); - public static VoicevoxSynthParams VoicevoxVoiceBase(VoicevoxQueryMain qNotes, string id) { - var queryurl = new VoicevoxURL() { method = "POST", path = "/sing_frame_audio_query", query = new Dictionary { { "speaker", id } }, body = JsonConvert.SerializeObject(qNotes) }; + private static bool TryParseJson(string json, out JToken token) { + try { + token = JToken.Parse(json); + return true; + } catch (JsonReaderException ex) { + //Log.Error($"Invalid JSON: {ex.Message}"); + token = null; + return false; + } + } + + public static bool InitializedSpeaker(string id, bool skipReinit = false) { + var queryurl = new VoicevoxURL() { method = "GET", path = "/is_initialized_speaker", query = new Dictionary { { "speaker", id } } }; + var response = 
VoicevoxClient.Inst.SendRequest(queryurl); + var jObj = JObject.Parse(response.Item1); + if (jObj.ContainsKey("detail")) { + Log.Error($"Response was incorrect. : {jObj}"); + return false; + } else if (jObj.ContainsKey("json")) { + if (jObj.TryGetValue("json", out var jsonToken)) { + if (!jsonToken.Value()) { + queryurl = new VoicevoxURL() { method = "POST", path = "/initialize_speaker", query = new Dictionary { { "speaker", id }, { "skip_reinit", skipReinit.ToString() } } }; + response = VoicevoxClient.Inst.SendRequest(queryurl); + if (TryParseJson(response.Item1, out var token)) { + if (jObj.ContainsKey("detail")) { + Log.Error($"Response was incorrect. : {jObj}"); + } + return false; + } + return true; + } else { + return true; + } + } + } + return false; + } + + public static VoicevoxSynthParams VoicevoxVoiceBase(VoicevoxQueryMain vqMain, string id) { + var queryurl = new VoicevoxURL() { method = "POST", path = "/sing_frame_audio_query", query = new Dictionary { { "speaker", id } }, body = JsonConvert.SerializeObject(vqMain) }; var response = VoicevoxClient.Inst.SendRequest(queryurl); VoicevoxSynthParams vvNotes; var jObj = JObject.Parse(response.Item1); if (jObj.ContainsKey("detail")) { Log.Error($"Response was incorrect. : {jObj}"); + throw new VoicevoxException($"Response was incorrect. 
: \n{jObj}\nScore:{string.Join(" ", vqMain.notes.Select(n => n.lyric))}"); } else { vvNotes = jObj.ToObject(); return vvNotes; @@ -198,7 +242,7 @@ public static void Loaddic(VoicevoxSinger singer) { dic.Loaddic(singer.Location); } - public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAxis timeAxis) { + public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAxis timeAxis, bool pitch_slur = false) { VoicevoxQueryMain vqMain = new VoicevoxQueryMain(); int index = 0; try { @@ -209,15 +253,32 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx vqnindex = -1 }); int short_length_count = 0; + int slur_index = 0; while (index < vNotes.Length) { string lyric = dic.Notetodic(vNotes, index); - //Avoid synthesis without at least two frames. double durationMs = vNotes[index].durationMs; + // TODO: slur support + if (IsSyllableVowelExtensionNote(vNotes[index].lyric)) { + // ピッチ生成でスラーを考慮する場合、母音伸ばしノートは前のノートのカナを引き継ぐ + if (pitch_slur) { + if (vNotes.Length >= index - 1 && 0 <= index - 1) { + if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vNotes[index - 1].lyric, out string str)) { + lyric = str; + slur_index++; + } + } else { + slur_index = 0; + } + } + } int length = (int)Math.Round((durationMs / 1000f) * VoicevoxUtils.fps, MidpointRounding.AwayFromZero); + //Avoid synthesis without at least two frames. if (length < 2) { length = 2; } + if (durationMs > (length / VoicevoxUtils.fps) * 1000f) { + // If the note length is longer than the rounded length, increase the length by one. if (short_length_count >= 2) { length += 1; short_length_count = 0; @@ -225,17 +286,28 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx short_length_count += 1; } } + if (IsSyllableVowelExtensionNote(vNotes[index].lyric)) { + if (!pitch_slur) { + //通常合成ではスラー分の長さを前のノートに足す + vqMain.notes[index].frame_length += length; + break; + } + } + + //Set tone to null if lyric is empty int? 
tone = null; if (!string.IsNullOrEmpty(lyric)) { tone = vNotes[index].tone; } else { + // 明示的に空文字を設定する。 lyric = ""; } vqMain.notes.Add(new VoicevoxQueryNotes { lyric = lyric, frame_length = length, key = tone, - vqnindex = index + vqnindex = index, + slur_index = slur_index }); index++; } @@ -253,13 +325,14 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx } public static List QueryToF0(VoicevoxQueryMain vqMain, VoicevoxSynthParams vsParams, string id) { - VoicevoxQueryParams vqParams = new VoicevoxQueryParams() { score = vqMain, frame_audio_query = vsParams }; + VoicevoxQueryParams vqParams = new VoicevoxQueryParams() { score = vqMain, frame_audio_query = vsParams }; var queryurl = new VoicevoxURL() { method = "POST", path = "/sing_frame_f0", query = new Dictionary { { "speaker", id } }, body = JsonConvert.SerializeObject(vqParams) }; var response = VoicevoxClient.Inst.SendRequest(queryurl); List f0s = new List(); var jObj = JObject.Parse(response.Item1); if (jObj.ContainsKey("detail")) { Log.Error($"Response was incorrect. : {jObj}"); + throw new VoicevoxException($"Response was incorrect. : \n{jObj}\nScore:{string.Join(" ", vqMain.notes.Select(n => n.lyric))}"); } else { f0s = jObj["json"].ToObject>(); } @@ -274,12 +347,45 @@ public static List QueryToVolume(VoicevoxQueryMain vqMain, VoicevoxSynth var jObj = JObject.Parse(response.Item1); if (jObj.ContainsKey("detail")) { Log.Error($"Response was incorrect. : {jObj}"); + throw new VoicevoxException($"Response was incorrect. : \n{jObj}\nScore:{string.Join(" ", vqMain.notes.Select(n => n.lyric))}"); } else { volumes = jObj["json"].ToObject>(); } return volumes; } + public static void AdjustF0ForSlur(VoicevoxQueryMain vqMain, List f0) { + if (vqMain == null || vqMain.notes == null || f0 == null) { + return; + } + + int offset = 0; + int? 
baseKey = null; + foreach (var note in vqMain.notes) { + int start = offset; + int end = Math.Min(f0.Count, offset + note.frame_length); + + if (note.key.HasValue) { + if (note.slur_index == 0) { + baseKey = note.key; + } else if (note.slur_index > 0 && baseKey.HasValue) { + int delta = note.key.Value - baseKey.Value; + if (delta != 0) { + double factor = Math.Pow(2d, delta / 12d); + for (int i = start; i < end; i++) { + f0[i] *= factor; + } + } + } + } + + offset += note.frame_length; + if (offset >= f0.Count) { + break; + } + } + } + public static double[] SampleCurve(RenderPhrase phrase, float[] curve, double defaultValue, double frameMs, int length, int headFrames, int tailFrames, double offset, Func convert) { const int interval = 5; var result = new double[length]; @@ -307,15 +413,12 @@ public static double[] SampleCurve(RenderPhrase phrase, float[] curve, double de } return result; } - - public static bool IsDicKana(string s) { + public static bool IsKana(string s) { return phoneme_List.kanas.ContainsKey(s); } - - public static bool IsDicPau(string s) { + public static bool IsPau(string s) { return phoneme_List.paus.ContainsKey(s); } - public static bool IsVowel(string s) { return phoneme_List.vowels.Contains(s); } @@ -329,12 +432,16 @@ public static bool TryGetPau(string s, out string str) { } public static string getBaseSingerID(VoicevoxSinger singer) { - if (singer.voicevoxConfig.base_singer_style != null) { - foreach (var s in singer.voicevoxConfig.base_singer_style) { - if (s.name.Equals(singer.voicevoxConfig.base_singer_name)) { - if (s.styles.name.Equals(singer.voicevoxConfig.base_singer_style_name)) { - return s.styles.id.ToString(); - } + if (singer.voicevoxConfig == null) { + return defaultID; + } + if (singer.voicevoxConfig.base_singer_style == null) { + return defaultID; + } + foreach (var s in singer.voicevoxConfig.base_singer_style) { + if (s.name.Equals(singer.voicevoxConfig.base_singer_name)) { + if 
(s.styles.name.Equals(singer.voicevoxConfig.base_singer_style_name)) { + return s.styles.id.ToString(); } } } From d0ee4b314aadafed256d01d36c8c514a058c8c37 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Sat, 24 Jan 2026 18:50:03 +0900 Subject: [PATCH 2/4] Improved cache parameters for VOICEVOX synthesis process Include both phrase.hash and HashPhraseGroups in VoicevoxRenderer's cache file name to prevent conflicts; reorganize parameter generation logic in PhraseToVoicevoxSynthParams, unify variable names and refine error messages. BuildVNotes improves lyrics handling and exception messages; InitializedSpeaker in VoicevoxUtils simplifies JSON parsing and organizes response decisions; and NoteGroupsToVQuery clarifies slur processing and improves the way slurs are handled. Simplified conditional expressions in getBaseSingerID. --- OpenUtau.Core/Voicevox/VoicevoxRenderer.cs | 78 +++++++++----------- OpenUtau.Core/Voicevox/VoicevoxUtils.cs | 85 ++++++++++------------ 2 files changed, 72 insertions(+), 91 deletions(-) diff --git a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs index e6549b6b6..c2ca70b85 100644 --- a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs +++ b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs @@ -67,8 +67,8 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra } string progressInfo = $"Track {trackNo + 1}: {this} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\""; progress.Complete(0, progressInfo); - //ulong hash = HashPhraseGroups(phrase); - var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}.wav"); + ulong hash = HashPhraseGroups(phrase); + var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}_{hash:x8}.wav"); phrase.AddCacheFile(wavPath); var result = Layout(phrase); if (!File.Exists(wavPath)) { @@ -140,7 +140,6 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra } } catch (Exception e) { if (e is 
VoicevoxException) { - Log.Error($"Failed to create the audio."); throw new MessageCustomizableException("Failed to create the audio.", "Failed to create the audio.", e); } else { Log.Error($"Failed to create the audio.{e}"); @@ -172,96 +171,87 @@ private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, Voi //Prepare for future additions of Teacher Singer. string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); VoicevoxUtils.InitializedSpeaker(baseSingerID, true); - List vnotes_1 = BuildVNotes(phrase, baseSingerID); + List vNotes = BuildVNotes(phrase, baseSingerID); //Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume //Create parameters for the update source. - VoicevoxQueryMain vqMain_1 = VoicevoxUtils.NoteGroupsToVQuery(vnotes_1.ToArray(), phrase.timeAxis); - VoicevoxSynthParams vsParams = new VoicevoxSynthParams(); + VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis); + VoicevoxSynthParams vsParams; if (IsPhonemeNoteCountMatch(phrase)) { - vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain_1, baseSingerID); + vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); } else { - VoicevoxSynthParams vsParams_1 = VoicevoxUtils.VoicevoxVoiceBase(vqMain_1, baseSingerID); + //vsParamsServer is a parameter to hold phonemes generated from note lyrics + VoicevoxSynthParams vsParamsServer = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); //Create parameters for the update destination. 
vsParams = PhonemeToVoicevoxSynthParams(phrase); - VoicevoxSynthParams vsParams_2 = vsParams.Clone(); - if (vsParams.phonemes.Count == vsParams_1.phonemes.Count) { - for (int i = 0; i < vsParams_1.phonemes.Count; i++) { + //vsParamsUser is a parameter to hold phonemes changed by the user + VoicevoxSynthParams vsParamsUser = vsParams.Clone(); + if (vsParams.phonemes.Count == vsParamsServer.phonemes.Count) { + for (int i = 0; i < vsParamsServer.phonemes.Count; i++) { // TODO: Develop a VOICEVOX engine dedicated to OpenUtau so that synthesis parameters are updated when phonemes are changed. //var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM); //if (flag != null) { // if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) { - vsParams.phonemes[i].phoneme = vsParams_1.phonemes[i].phoneme; + vsParams.phonemes[i].phoneme = vsParamsServer.phonemes[i].phoneme; // } //} } } //Update F0 and volume - vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain_1, vsParams, baseSingerID); - vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain_1, vsParams, baseSingerID); + vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain, vsParams, baseSingerID); + vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain, vsParams, baseSingerID); //Update phoneme - for (int i = 0; i < vsParams_2.phonemes.Count; i++) { + for (int i = 0; i < vsParamsUser.phonemes.Count; i++) { //var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM); //if (flag != null) { // if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) { - vsParams.phonemes[i].phoneme = vsParams_2.phonemes[i].phoneme; + vsParams.phonemes[i].phoneme = vsParamsUser.phonemes[i].phoneme; // } //} } } if (pitch_slur) { - VoicevoxUtils.AdjustF0ForSlur(VoicevoxUtils.NoteGroupsToVQuery(vnotes_1.ToArray(), phrase.timeAxis, true), vsParams.f0); + VoicevoxUtils.AdjustF0ForSlur(VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis, true), vsParams.f0); } return vsParams; } private List BuildVNotes(RenderPhrase 
phrase, string baseSingerID) { - List vnotes = new List(); + List vNotes = new List(); try { - //if (IsPhonemeNoteCountMatch(phrase)) { - // for (int i = 0; i < phrase.phones.Length; i++) { - // vnotes.Add(new VoicevoxNote() { - // lyric = phrase.phones[i].phoneme, - // positionMs = phrase.phones[i].positionMs, - // durationMs = phrase.phones[i].durationMs, - // tone = (int)(phrase.phones[i].tone + phrase.phones[i].toneShift) - // }); - // } - //} else { for (int i = 0; i < phrase.notes.Length; i++) { var durationMs = phrase.notes[i].durationMs; var currentLyric = phrase.notes[i].lyric.Normalize(); var lyricList = currentLyric.Split(" "); var shiftTone = phrase.phones[0].toneShift; - foreach (string lyric in lyricList) { - if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyric)) { - if (VoicevoxUtils.IsPau(lyric)) { - currentLyric = string.Empty; - } else if (VoicevoxUtils.dic.IsDic(lyric)) { - currentLyric = VoicevoxUtils.dic.Lyrictodic(lyric); - } else if (VoicevoxUtils.phoneme_List.kanas.ContainsKey(lyric)) { - currentLyric = lyric; - } else { - currentLyric = string.Empty; - } + if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyricList[^1])) { + if (VoicevoxUtils.IsPau(lyricList[^1])) { + currentLyric = string.Empty; + } else if (VoicevoxUtils.dic.IsDic(lyricList[^1])) { + currentLyric = VoicevoxUtils.dic.Lyrictodic(lyricList[^1]); + } else if (VoicevoxUtils.phoneme_List.kanas.ContainsKey(lyricList[^1])) { + currentLyric = lyricList[^1]; + } else { + currentLyric = string.Empty; } } - vnotes.Add(new VoicevoxNote() { + vNotes.Add(new VoicevoxNote() { lyric = currentLyric, positionMs = phrase.notes[i].positionMs, durationMs = durationMs, tone = (int)(phrase.notes[i].tone + shiftTone) }); } - //} } catch (Exception e) { + var phonemeText = string.Join(" ", phrase.phones.Select(p => p.phoneme)); throw new MessageCustomizableException( - $"Failed to create a voice base. 
The phoneme is not supported by the VOICEVOX engine.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", - $"You are confusing phonemes and hiragana.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", new VoicevoxException()); + $"Failed to create a voice base. One or more phonemes may not be supported by the VOICEVOX engine.\n{phonemeText}", + $"An error occurred while creating a voice base from the current phrase. This may be caused by unsupported phonemes, invalid input, or a mismatch between phonemes and hiragana.\nPhonemes: {phonemeText}\nDetails: {e.Message}", + new VoicevoxException()); } - return vnotes; + return vNotes; } private bool IsPhonemeNoteCountMatch(RenderPhrase phrase) { diff --git a/OpenUtau.Core/Voicevox/VoicevoxUtils.cs b/OpenUtau.Core/Voicevox/VoicevoxUtils.cs index e3abf4b21..00bb09102 100644 --- a/OpenUtau.Core/Voicevox/VoicevoxUtils.cs +++ b/OpenUtau.Core/Voicevox/VoicevoxUtils.cs @@ -184,17 +184,17 @@ public static class VoicevoxUtils { // Phonemes and dictionaries public static Dictionary_list dic = new Dictionary_list(); public static Phoneme_list phoneme_List = new Phoneme_list(); - - private static bool TryParseJson(string json, out JToken token) { - try { - token = JToken.Parse(json); - return true; - } catch (JsonReaderException ex) { - //Log.Error($"Invalid JSON: {ex.Message}"); - token = null; - return false; - } - } + // JSON parse helper for testing + //private static bool TryParseJson(string json, out JToken token) { + // try { + // token = JToken.Parse(json); + // return true; + // } catch (JsonReaderException ex) { + // Log.Error($"Invalid JSON: {ex.Message}"); + // token = null; + // return false; + // } + //} public static bool InitializedSpeaker(string id, bool skipReinit = false) { var queryurl = new VoicevoxURL() { method = "GET", path = "/is_initialized_speaker", query = new Dictionary { { "speaker", id } } }; @@ -203,21 +203,18 @@ public static bool InitializedSpeaker(string id, bool skipReinit = 
false) { if (jObj.ContainsKey("detail")) { Log.Error($"Response was incorrect. : {jObj}"); return false; - } else if (jObj.ContainsKey("json")) { - if (jObj.TryGetValue("json", out var jsonToken)) { - if (!jsonToken.Value()) { - queryurl = new VoicevoxURL() { method = "POST", path = "/initialize_speaker", query = new Dictionary { { "speaker", id }, { "skip_reinit", skipReinit.ToString() } } }; - response = VoicevoxClient.Inst.SendRequest(queryurl); - if (TryParseJson(response.Item1, out var token)) { - if (jObj.ContainsKey("detail")) { - Log.Error($"Response was incorrect. : {jObj}"); - } - return false; - } - return true; - } else { - return true; + } else if (jObj.TryGetValue("json", out var jsonToken)) { + if (!jsonToken.Value()) { + queryurl = new VoicevoxURL() { method = "POST", path = "/initialize_speaker", query = new Dictionary { { "speaker", id }, { "skip_reinit", skipReinit.ToString() } } }; + response = VoicevoxClient.Inst.SendRequest(queryurl); + jObj = JObject.Parse(response.Item1); + if (jObj.ContainsKey("detail")) { + Log.Error($"Response was incorrect. 
: {jObj}"); + return false; } + return true; + } else { + return true; } } return false; @@ -257,18 +254,15 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx while (index < vNotes.Length) { string lyric = dic.Notetodic(vNotes, index); double durationMs = vNotes[index].durationMs; - // TODO: slur support - if (IsSyllableVowelExtensionNote(vNotes[index].lyric)) { - // ピッチ生成でスラーを考慮する場合、母音伸ばしノートは前のノートのカナを引き継ぐ - if (pitch_slur) { - if (vNotes.Length >= index - 1 && 0 <= index - 1) { - if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vNotes[index - 1].lyric, out string str)) { - lyric = str; - slur_index++; - } - } else { - slur_index = 0; + // When slurs are considered in pitch generation, vowel-stretched notes inherit the Kana of the previous note + if (IsSyllableVowelExtensionNote(vNotes[index].lyric) && pitch_slur) { + if (index > 0) { + if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vNotes[index - 1].lyric, out string str)) { + lyric = str; + slur_index++; } + } else { + slur_index = 0; } } int length = (int)Math.Round((durationMs / 1000f) * VoicevoxUtils.fps, MidpointRounding.AwayFromZero); @@ -286,12 +280,10 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx short_length_count += 1; } } - if (IsSyllableVowelExtensionNote(vNotes[index].lyric)) { - if (!pitch_slur) { - //通常合成ではスラー分の長さを前のノートに足す - vqMain.notes[index].frame_length += length; - break; - } + //Usually synthesis adds the length of the slur to the previous note. + if (IsSyllableVowelExtensionNote(vNotes[index].lyric) && !pitch_slur) { + vqMain.notes[index].frame_length += length; + continue; } //Set tone to null if lyric is empty @@ -299,7 +291,7 @@ public static VoicevoxQueryMain NoteGroupsToVQuery(VoicevoxNote[] vNotes, TimeAx if (!string.IsNullOrEmpty(lyric)) { tone = vNotes[index].tone; } else { - // 明示的に空文字を設定する。 + // Explicitly set to empty string. 
lyric = ""; } vqMain.notes.Add(new VoicevoxQueryNotes { @@ -439,10 +431,9 @@ public static string getBaseSingerID(VoicevoxSinger singer) { return defaultID; } foreach (var s in singer.voicevoxConfig.base_singer_style) { - if (s.name.Equals(singer.voicevoxConfig.base_singer_name)) { - if (s.styles.name.Equals(singer.voicevoxConfig.base_singer_style_name)) { - return s.styles.id.ToString(); - } + if (s.name.Equals(singer.voicevoxConfig.base_singer_name) + && s.styles.name.Equals(singer.voicevoxConfig.base_singer_style_name)) { + return s.styles.id.ToString(); } } return defaultID; From aad62167dec503aa4534b764b194de2e58975dfd Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Sat, 24 Jan 2026 19:11:36 +0900 Subject: [PATCH 3/4] Improved lyrics segmentation and decision logic and organized conditional branching. Modified the process to be similar to VoicevoxRenderer.BuildVNotes. Corrected function names to refer to the pre-modification functions (IsDicPau -> IsPau, IsDicKana -> IsKana). --- .../Voicevox/Phonemizers/VoicevoxPhonemizer.cs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/OpenUtau.Core/Voicevox/Phonemizers/VoicevoxPhonemizer.cs b/OpenUtau.Core/Voicevox/Phonemizers/VoicevoxPhonemizer.cs index 75bdeb787..0a5eee8fb 100644 --- a/OpenUtau.Core/Voicevox/Phonemizers/VoicevoxPhonemizer.cs +++ b/OpenUtau.Core/Voicevox/Phonemizers/VoicevoxPhonemizer.cs @@ -22,15 +22,14 @@ public override void SetUp(Note[][] notes, UProject project, UTrack track) { for (int i = 0; i < notes.Length; i++) { var currentLyric = notes[i][0].lyric.Normalize(); var lyricList = currentLyric.Split(" "); - if (lyricList.Length > 1) { - currentLyric = lyricList[1]; - } - if (!VoicevoxUtils.IsSyllableVowelExtensionNote(currentLyric)) { - if (VoicevoxUtils.IsDicPau(currentLyric)) { + if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyricList[^1])) { + if (VoicevoxUtils.IsPau(lyricList[^1])) { + currentLyric = string.Empty; + } else if (VoicevoxUtils.dic.IsDic(lyricList[^1])) { + currentLyric = 
VoicevoxUtils.dic.Lyrictodic(lyricList[^1]); + } else if (!VoicevoxUtils.IsKana(lyricList[^1])) { currentLyric = string.Empty; - } else if (VoicevoxUtils.dic.IsDic(currentLyric)) { - currentLyric = VoicevoxUtils.dic.Lyrictodic(currentLyric); - } else if (!VoicevoxUtils.IsDicKana(currentLyric)) { + } else { currentLyric = string.Empty; } } From d5a3498d277343aa2b35a9b3626ec90458322787 Mon Sep 17 00:00:00 2001 From: rokujyushi Date: Sat, 24 Jan 2026 19:22:15 +0900 Subject: [PATCH 4/4] Changed exception log output to messages only The log output when an exception occurs has been changed from the entire exception object to only the exception message (e.Message). This makes the log more concise and suppresses the output of unnecessary information. --- OpenUtau.Core/Voicevox/VoicevoxRenderer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs index c2ca70b85..77d3ca880 100644 --- a/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs +++ b/OpenUtau.Core/Voicevox/VoicevoxRenderer.cs @@ -142,7 +142,7 @@ public Task Render(RenderPhrase phrase, Progress progress, int tra if (e is VoicevoxException) { throw new MessageCustomizableException("Failed to create the audio.", "Failed to create the audio.", e); } else { - Log.Error($"Failed to create the audio.{e}"); + Log.Error(e.Message); } } if (cancellation.IsCancellationRequested) {