-
Notifications
You must be signed in to change notification settings - Fork 465
Enhanced Slur Expression and Added F0 Correction Processing in VOICEVOX #1918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
b38766c
2ad1977
d0ee4b3
b8f918c
aad6216
0bc662d
d5a3498
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -4,13 +4,15 @@ | |||
| using System.Linq; | ||||
| using System.Threading; | ||||
| using System.Threading.Tasks; | ||||
| using K4os.Hash.xxHash; | ||||
| using NAudio.Wave; | ||||
| using Newtonsoft.Json; | ||||
| using Newtonsoft.Json.Linq; | ||||
| using OpenUtau.Core.Format; | ||||
| using OpenUtau.Core.Render; | ||||
| using OpenUtau.Core.Ustx; | ||||
| using Serilog; | ||||
| using SharpCompress; | ||||
| using ThirdParty; | ||||
|
|
||||
| /* | ||||
|
|
@@ -65,7 +67,8 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra | |||
| } | ||||
| string progressInfo = $"Track {trackNo + 1}: {this} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\""; | ||||
| progress.Complete(0, progressInfo); | ||||
| var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}.wav"); | ||||
| ulong hash = HashPhraseGroups(phrase); | ||||
| var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}_{hash:x8}.wav"); | ||||
| phrase.AddCacheFile(wavPath); | ||||
| var result = Layout(phrase); | ||||
| if (!File.Exists(wavPath)) { | ||||
|
|
@@ -75,7 +78,8 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra | |||
| VoicevoxUtils.Loaddic(singer); | ||||
| } | ||||
| try { | ||||
| VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger); | ||||
| Log.Information($"Starting Voicevox synthesis"); | ||||
| VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger, false); | ||||
|
|
||||
| int vvTotalFrames = 0; | ||||
| double frameMs = (1000d / VoicevoxUtils.fps); | ||||
|
|
@@ -106,18 +110,21 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra | |||
| result.positionMs = phrase.positionMs - phrase.timeAxis.TickPosToMsPos((vsParams.phonemes.First().frame_length / VoicevoxUtils.fps) * 1000d); | ||||
| } | ||||
|
|
||||
| int speaker = 0; | ||||
| int speakerID = 0; | ||||
| singer.voicevoxConfig.styles.ForEach(style => { | ||||
| if (style.name.Equals(phrase.singer.Subbanks[1].Suffix) && style.type.Equals("frame_decode")) { | ||||
| speaker = style.id; | ||||
| speakerID = style.id; | ||||
| } | ||||
| // Apply the voice color setting value | ||||
| if (style.name.Equals(phrase.phones[0].suffix) && style.type.Equals("frame_decode")) { | ||||
| speaker = style.id; | ||||
| } else if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) { | ||||
| speaker = style.id; | ||||
| speakerID = style.id; | ||||
| } else // Supports styles with the same name but different types | ||||
| if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) { | ||||
| speakerID = style.id; | ||||
| } | ||||
| }); | ||||
| var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary<string, string> { { "speaker", speaker.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" }; | ||||
| VoicevoxUtils.InitializedSpeaker(speakerID.ToString(), false); | ||||
| var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary<string, string> { { "speaker", speakerID.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" }; | ||||
| var response = VoicevoxClient.Inst.SendRequest(queryurl); | ||||
| byte[] bytes = null; | ||||
| if (!response.Item2.Equals(null)) { | ||||
|
|
@@ -132,7 +139,11 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra | |||
| File.WriteAllBytes(wavPath, bytes); | ||||
| } | ||||
| } catch (Exception e) { | ||||
| Log.Error(e, "Failed to create a voice base."); | ||||
| if (e is VoicevoxException) { | ||||
| throw new MessageCustomizableException("Failed to create the audio.", "Failed to create the audio.", e); | ||||
| } else { | ||||
| Log.Error(e.Message); | ||||
rokujyushi marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
| } | ||||
| } | ||||
| if (cancellation.IsCancellationRequested) { | ||||
| return new RenderResult(); | ||||
|
|
@@ -155,85 +166,34 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra | |||
| return task; | ||||
| } | ||||
|
|
||||
| private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer) { | ||||
| private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer, bool pitch_slur) { | ||||
|
|
||||
| VoicevoxSynthParams vsParams = new VoicevoxSynthParams(); | ||||
| //Prepare for future additions of Teacher Singer. | ||||
| string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); | ||||
|
|
||||
| if (phrase.phones.All(p => VoicevoxUtils.IsDicKana(p.phoneme) || VoicevoxUtils.IsDicPau(p.phoneme))) { | ||||
| // TODO: slur support | ||||
| List<VoicevoxNote> vnotes = new List<VoicevoxNote>(); | ||||
| //if (slur) { | ||||
| for (int i = 0; i < phrase.phones.Length; i++) { | ||||
| vnotes.Add(new VoicevoxNote() { | ||||
| lyric = phrase.phones[i].phoneme, | ||||
| positionMs = phrase.phones[i].positionMs, | ||||
| durationMs = phrase.phones[i].durationMs, | ||||
| tone = (int)(phrase.phones[i].tone + phrase.phones[i].toneShift) | ||||
| }); | ||||
| } | ||||
| //} else { | ||||
| //} | ||||
|
|
||||
| VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis); | ||||
|
|
||||
| VoicevoxUtils.InitializedSpeaker(baseSingerID, true); | ||||
| List<VoicevoxNote> vNotes = BuildVNotes(phrase, baseSingerID); | ||||
|
|
||||
| //Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume | ||||
| //Create parameters for the update source. | ||||
| VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis); | ||||
| VoicevoxSynthParams vsParams; | ||||
| if (IsPhonemeNoteCountMatch(phrase)) { | ||||
| vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); | ||||
| } else if (phrase.phones.All(p => VoicevoxUtils.IsVowel(p.phoneme) || VoicevoxUtils.IsConsonant(p.phoneme))) { | ||||
| List<VoicevoxNote> vnotes = new List<VoicevoxNote>(); | ||||
| for (int i = 0; i < phrase.notes.Length; i++) { | ||||
| var durationMs = phrase.notes[i].durationMs; | ||||
| var currentLyric = phrase.notes[i].lyric.Normalize(); | ||||
| var lyricList = currentLyric.Split(" "); | ||||
| var shiftTone = phrase.phones[0].toneShift; | ||||
| if (lyricList.Length > 1) { | ||||
| currentLyric = lyricList[1]; | ||||
| } | ||||
| if (!VoicevoxUtils.IsSyllableVowelExtensionNote(currentLyric)) { | ||||
| if (VoicevoxUtils.IsDicPau(currentLyric)) { | ||||
| currentLyric = string.Empty; | ||||
| } else if (VoicevoxUtils.dic.IsDic(currentLyric)) { | ||||
| currentLyric = VoicevoxUtils.dic.Lyrictodic(currentLyric); | ||||
| } else if (!VoicevoxUtils.IsDicKana(currentLyric)) { | ||||
| currentLyric = string.Empty; | ||||
| } | ||||
| } else if (vnotes.Count >= i - 1 && 0 <= i - 1) { | ||||
| // TODO: slur support | ||||
| //if (slur) { | ||||
| var tempNote = vnotes[vnotes.Count - 1]; | ||||
| tempNote.durationMs += durationMs; | ||||
| vnotes[vnotes.Count - 1] = tempNote; | ||||
| continue; | ||||
| //} else { | ||||
| // if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vnotes[i - 1].lyric, out string str)) { | ||||
| // currentLyric = str; | ||||
| // } | ||||
| //} | ||||
| } | ||||
| vnotes.Add(new VoicevoxNote() { | ||||
| lyric = currentLyric, | ||||
| positionMs = phrase.notes[i].positionMs, | ||||
| durationMs = durationMs, | ||||
| tone = (int)(phrase.notes[i].tone + shiftTone) | ||||
| }); | ||||
| } | ||||
| //Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume | ||||
| //Create parameters for the update source. | ||||
| VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis); | ||||
| VoicevoxSynthParams vsParams_1 = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); | ||||
| } else { | ||||
| //vsParamsServer is a parameter to hold phonemes generated from note lyrics | ||||
| VoicevoxSynthParams vsParamsServer = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID); | ||||
|
|
||||
| //Create parameters for the update destination. | ||||
| vsParams = PhonemeToVoicevoxSynthParams(phrase); | ||||
| VoicevoxSynthParams vsParams_2 = vsParams.Clone(); | ||||
|
|
||||
|
|
||||
| if (vsParams.phonemes.Count == vsParams_1.phonemes.Count) { | ||||
| for (int i = 0; i < vsParams_1.phonemes.Count; i++) { | ||||
| //vsParamsUser is a parameter to hold phonemes changed by the user | ||||
| VoicevoxSynthParams vsParamsUser = vsParams.Clone(); | ||||
| if (vsParams.phonemes.Count == vsParamsServer.phonemes.Count) { | ||||
| for (int i = 0; i < vsParamsServer.phonemes.Count; i++) { | ||||
| // TODO: Develop a VOICEVOX engine dedicated to OpenUtau so that synthesis parameters are updated when phonemes are changed. | ||||
| //var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM); | ||||
| //if (flag != null) { | ||||
| // if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) { | ||||
| vsParams.phonemes[i].phoneme = vsParams_1.phonemes[i].phoneme; | ||||
| vsParams.phonemes[i].phoneme = vsParamsServer.phonemes[i].phoneme; | ||||
| // } | ||||
| //} | ||||
| } | ||||
|
|
@@ -242,22 +202,62 @@ private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, Voi | |||
| vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain, vsParams, baseSingerID); | ||||
| vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain, vsParams, baseSingerID); | ||||
| //Update phoneme | ||||
| for (int i = 0; i < vsParams_2.phonemes.Count; i++) { | ||||
| for (int i = 0; i < vsParamsUser.phonemes.Count; i++) { | ||||
| //var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM); | ||||
| //if (flag != null) { | ||||
| // if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) { | ||||
| vsParams.phonemes[i].phoneme = vsParams_2.phonemes[i].phoneme; | ||||
| vsParams.phonemes[i].phoneme = vsParamsUser.phonemes[i].phoneme; | ||||
| // } | ||||
| //} | ||||
| } | ||||
| } else { | ||||
| throw new MessageCustomizableException( | ||||
| $"Failed to create a voice base. The phoneme is not supported by the VOICEVOX engine.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", | ||||
| $"You are confusing phonemes and hiragana.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", new VoicevoxException()); | ||||
| } | ||||
| if (pitch_slur) { | ||||
| VoicevoxUtils.AdjustF0ForSlur(VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis, true), vsParams.f0); | ||||
| } | ||||
| return vsParams; | ||||
| } | ||||
|
|
||||
| private List<VoicevoxNote> BuildVNotes(RenderPhrase phrase, string baseSingerID) { | ||||
| List<VoicevoxNote> vNotes = new List<VoicevoxNote>(); | ||||
| try { | ||||
| for (int i = 0; i < phrase.notes.Length; i++) { | ||||
| var durationMs = phrase.notes[i].durationMs; | ||||
|
Comment on lines +220 to +224
|
||||
| var currentLyric = phrase.notes[i].lyric.Normalize(); | ||||
| var lyricList = currentLyric.Split(" "); | ||||
| var shiftTone = phrase.phones[0].toneShift; | ||||
| if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyricList[^1])) { | ||||
| if (VoicevoxUtils.IsPau(lyricList[^1])) { | ||||
| currentLyric = string.Empty; | ||||
| } else if (VoicevoxUtils.dic.IsDic(lyricList[^1])) { | ||||
| currentLyric = VoicevoxUtils.dic.Lyrictodic(lyricList[^1]); | ||||
| } else if (VoicevoxUtils.phoneme_List.kanas.ContainsKey(lyricList[^1])) { | ||||
| currentLyric = lyricList[^1]; | ||||
| } else { | ||||
| currentLyric = string.Empty; | ||||
| } | ||||
| } | ||||
| vNotes.Add(new VoicevoxNote() { | ||||
| lyric = currentLyric, | ||||
| positionMs = phrase.notes[i].positionMs, | ||||
| durationMs = durationMs, | ||||
| tone = (int)(phrase.notes[i].tone + shiftTone) | ||||
| }); | ||||
| } | ||||
| } catch (Exception e) { | ||||
| var phonemeText = string.Join(" ", phrase.phones.Select(p => p.phoneme)); | ||||
| throw new MessageCustomizableException( | ||||
| $"Failed to create a voice base. One or more phonemes may not be supported by the VOICEVOX engine.\n{phonemeText}", | ||||
| $"An error occurred while creating a voice base from the current phrase. This may be caused by unsupported phonemes, invalid input, or a mismatch between phonemes and hiragana.\nPhonemes: {phonemeText}\nDetails: {e.Message}", | ||||
| new VoicevoxException()); | ||||
| } | ||||
|
|
||||
| return vNotes; | ||||
| } | ||||
|
|
||||
| private bool IsPhonemeNoteCountMatch(RenderPhrase phrase) { | ||||
| return phrase.phones.Length == phrase.notes.Where(note => !VoicevoxUtils.IsSyllableVowelExtensionNote(note.lyric)).Count() && phrase.phones.All(p => VoicevoxUtils.phoneme_List.kanas.ContainsKey(p.phoneme)); | ||||
| } | ||||
|
|
||||
| private VoicevoxSynthParams PhonemeToVoicevoxSynthParams(RenderPhrase phrase) { | ||||
| VoicevoxSynthParams vsParams = new VoicevoxSynthParams(); | ||||
| int headFrames = (int)Math.Round((VoicevoxUtils.headS * VoicevoxUtils.fps), MidpointRounding.AwayFromZero); | ||||
|
|
@@ -407,7 +407,7 @@ RenderPitchResult IRenderer.LoadRenderedPitch(RenderPhrase phrase) { | |||
| if (singer != null) { | ||||
|
|
||||
| string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); | ||||
|
||||
| string baseSingerID = VoicevoxUtils.getBaseSingerID(singer); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this mapping logic, the `else` branch sets `currentLyric` to `string.Empty` even when `lyricList[^1]` is a valid Kana and not in the dictionary, which means plain Kana lyrics are discarded instead of being passed through. This likely should keep the Kana value (as done in `BuildVNotes` in `VoicevoxRenderer`) so that non-dictionary Kana notes are still synthesized rather than treated as empty/rest notes.