Skip to content
15 changes: 7 additions & 8 deletions OpenUtau.Core/Voicevox/Phonemizers/VoicevoxPhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@ public override void SetUp(Note[][] notes, UProject project, UTrack track) {
for (int i = 0; i < notes.Length; i++) {
var currentLyric = notes[i][0].lyric.Normalize();
var lyricList = currentLyric.Split(" ");
if (lyricList.Length > 1) {
currentLyric = lyricList[1];
}
if (!VoicevoxUtils.IsSyllableVowelExtensionNote(currentLyric)) {
if (VoicevoxUtils.IsDicPau(currentLyric)) {
if (!VoicevoxUtils.IsSyllableVowelExtensionNote(lyricList[^1])) {
if (VoicevoxUtils.IsPau(lyricList[^1])) {
currentLyric = string.Empty;
} else if (VoicevoxUtils.dic.IsDic(lyricList[^1])) {
currentLyric = VoicevoxUtils.dic.Lyrictodic(lyricList[^1]);
} else if (!VoicevoxUtils.IsKana(lyricList[^1])) {
currentLyric = string.Empty;
} else if (VoicevoxUtils.dic.IsDic(currentLyric)) {
currentLyric = VoicevoxUtils.dic.Lyrictodic(currentLyric);
} else if (!VoicevoxUtils.IsDicKana(currentLyric)) {
} else {
currentLyric = string.Empty;
Copy link

Copilot AI Jan 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this mapping logic, the else branch sets currentLyric to string.Empty even when lyricList[^1] is a valid Kana and not in the dictionary, which means plain Kana lyrics are discarded instead of being passed through. This likely should keep the Kana value (as done in BuildVNotes in VoicevoxRenderer) so that non-dictionary Kana notes are still synthesized rather than treated as empty/rest notes.

Suggested change
currentLyric = string.Empty;
currentLyric = lyricList[^1];

Copilot uses AI. Check for mistakes.
}
}
Expand Down
186 changes: 102 additions & 84 deletions OpenUtau.Core/Voicevox/VoicevoxRenderer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using K4os.Hash.xxHash;
using NAudio.Wave;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using OpenUtau.Core.Format;
using OpenUtau.Core.Render;
using OpenUtau.Core.Ustx;
using Serilog;
using SharpCompress;
using ThirdParty;

/*
Expand Down Expand Up @@ -65,7 +67,8 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra
}
string progressInfo = $"Track {trackNo + 1}: {this} \"{string.Join(" ", phrase.phones.Select(p => p.phoneme))}\"";
progress.Complete(0, progressInfo);
var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}.wav");
ulong hash = HashPhraseGroups(phrase);
var wavPath = Path.Join(PathManager.Inst.CachePath, $"vv-{phrase.hash:x16}_{hash:x8}.wav");
phrase.AddCacheFile(wavPath);
var result = Layout(phrase);
if (!File.Exists(wavPath)) {
Expand All @@ -75,7 +78,8 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra
VoicevoxUtils.Loaddic(singer);
}
try {
VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger);
Log.Information($"Starting Voicevox synthesis");
VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger, false);

int vvTotalFrames = 0;
double frameMs = (1000d / VoicevoxUtils.fps);
Expand Down Expand Up @@ -106,18 +110,21 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra
result.positionMs = phrase.positionMs - phrase.timeAxis.TickPosToMsPos((vsParams.phonemes.First().frame_length / VoicevoxUtils.fps) * 1000d);
}

int speaker = 0;
int speakerID = 0;
singer.voicevoxConfig.styles.ForEach(style => {
if (style.name.Equals(phrase.singer.Subbanks[1].Suffix) && style.type.Equals("frame_decode")) {
speaker = style.id;
speakerID = style.id;
}
// Apply the voice color setting value
if (style.name.Equals(phrase.phones[0].suffix) && style.type.Equals("frame_decode")) {
speaker = style.id;
} else if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) {
speaker = style.id;
speakerID = style.id;
} else // Supports styles with the same name but different types
if ((style.name + "_" + style.type).Equals(phrase.phones[0].suffix)) {
speakerID = style.id;
}
});
var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary<string, string> { { "speaker", speaker.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" };
VoicevoxUtils.InitializedSpeaker(speakerID.ToString(), false);
var queryurl = new VoicevoxURL() { method = "POST", path = "/frame_synthesis", query = new Dictionary<string, string> { { "speaker", speakerID.ToString() } }, body = JsonConvert.SerializeObject(vsParams), accept = "audio/wav" };
var response = VoicevoxClient.Inst.SendRequest(queryurl);
byte[] bytes = null;
if (!response.Item2.Equals(null)) {
Expand All @@ -132,7 +139,11 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra
File.WriteAllBytes(wavPath, bytes);
}
} catch (Exception e) {
Log.Error(e, "Failed to create a voice base.");
if (e is VoicevoxException) {
throw new MessageCustomizableException("Failed to create the audio.", "Failed to create the audio.", e);
} else {
Log.Error(e.Message);
}
}
if (cancellation.IsCancellationRequested) {
return new RenderResult();
Expand All @@ -155,85 +166,34 @@ public Task<RenderResult> Render(RenderPhrase phrase, Progress progress, int tra
return task;
}

private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer) {
private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, VoicevoxSinger singer, bool pitch_slur) {

VoicevoxSynthParams vsParams = new VoicevoxSynthParams();
//Prepare for future additions of Teacher Singer.
string baseSingerID = VoicevoxUtils.getBaseSingerID(singer);

if (phrase.phones.All(p => VoicevoxUtils.IsDicKana(p.phoneme) || VoicevoxUtils.IsDicPau(p.phoneme))) {
// TODO: slur support
List<VoicevoxNote> vnotes = new List<VoicevoxNote>();
//if (slur) {
for (int i = 0; i < phrase.phones.Length; i++) {
vnotes.Add(new VoicevoxNote() {
lyric = phrase.phones[i].phoneme,
positionMs = phrase.phones[i].positionMs,
durationMs = phrase.phones[i].durationMs,
tone = (int)(phrase.phones[i].tone + phrase.phones[i].toneShift)
});
}
//} else {
//}

VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis);

VoicevoxUtils.InitializedSpeaker(baseSingerID, true);
List<VoicevoxNote> vNotes = BuildVNotes(phrase, baseSingerID);

//Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume
//Create parameters for the update source.
VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis);
VoicevoxSynthParams vsParams;
if (IsPhonemeNoteCountMatch(phrase)) {
vsParams = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID);
} else if (phrase.phones.All(p => VoicevoxUtils.IsVowel(p.phoneme) || VoicevoxUtils.IsConsonant(p.phoneme))) {
List<VoicevoxNote> vnotes = new List<VoicevoxNote>();
for (int i = 0; i < phrase.notes.Length; i++) {
var durationMs = phrase.notes[i].durationMs;
var currentLyric = phrase.notes[i].lyric.Normalize();
var lyricList = currentLyric.Split(" ");
var shiftTone = phrase.phones[0].toneShift;
if (lyricList.Length > 1) {
currentLyric = lyricList[1];
}
if (!VoicevoxUtils.IsSyllableVowelExtensionNote(currentLyric)) {
if (VoicevoxUtils.IsDicPau(currentLyric)) {
currentLyric = string.Empty;
} else if (VoicevoxUtils.dic.IsDic(currentLyric)) {
currentLyric = VoicevoxUtils.dic.Lyrictodic(currentLyric);
} else if (!VoicevoxUtils.IsDicKana(currentLyric)) {
currentLyric = string.Empty;
}
} else if (vnotes.Count >= i - 1 && 0 <= i - 1) {
// TODO: slur support
//if (slur) {
var tempNote = vnotes[vnotes.Count - 1];
tempNote.durationMs += durationMs;
vnotes[vnotes.Count - 1] = tempNote;
continue;
//} else {
// if (VoicevoxUtils.phoneme_List.kanas.TryGetValue(vnotes[i - 1].lyric, out string str)) {
// currentLyric = str;
// }
//}
}
vnotes.Add(new VoicevoxNote() {
lyric = currentLyric,
positionMs = phrase.notes[i].positionMs,
durationMs = durationMs,
tone = (int)(phrase.notes[i].tone + shiftTone)
});
}
//Match the phonemes in the synthesis parameters to the scores in the score to update F0 and volume
//Create parameters for the update source.
VoicevoxQueryMain vqMain = VoicevoxUtils.NoteGroupsToVQuery(vnotes.ToArray(), phrase.timeAxis);
VoicevoxSynthParams vsParams_1 = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID);
} else {
//vsParamsServer is a parameter to hold phonemes generated from note lyrics
VoicevoxSynthParams vsParamsServer = VoicevoxUtils.VoicevoxVoiceBase(vqMain, baseSingerID);

//Create parameters for the update destination.
vsParams = PhonemeToVoicevoxSynthParams(phrase);
VoicevoxSynthParams vsParams_2 = vsParams.Clone();


if (vsParams.phonemes.Count == vsParams_1.phonemes.Count) {
for (int i = 0; i < vsParams_1.phonemes.Count; i++) {
//vsParamsUser is a parameter to hold phonemes changed by the user
VoicevoxSynthParams vsParamsUser = vsParams.Clone();
if (vsParams.phonemes.Count == vsParamsServer.phonemes.Count) {
for (int i = 0; i < vsParamsServer.phonemes.Count; i++) {
// TODO: Develop a VOICEVOX engine dedicated to OpenUtau so that synthesis parameters are updated when phonemes are changed.
//var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM);
//if (flag != null) {
// if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) {
vsParams.phonemes[i].phoneme = vsParams_1.phonemes[i].phoneme;
vsParams.phonemes[i].phoneme = vsParamsServer.phonemes[i].phoneme;
// }
//}
}
Expand All @@ -242,22 +202,62 @@ private VoicevoxSynthParams PhraseToVoicevoxSynthParams(RenderPhrase phrase, Voi
vsParams.f0 = VoicevoxUtils.QueryToF0(vqMain, vsParams, baseSingerID);
vsParams.volume = VoicevoxUtils.QueryToVolume(vqMain, vsParams, baseSingerID);
//Update phoneme
for (int i = 0; i < vsParams_2.phonemes.Count; i++) {
for (int i = 0; i < vsParamsUser.phonemes.Count; i++) {
//var flag = phrase.phones[i].flags.FirstOrDefault(f => f.Item1 == VoicevoxUtils.REPM);
//if (flag != null) {
// if (flag.Item3.Equals(VoicevoxUtils.REPLACE)) {
vsParams.phonemes[i].phoneme = vsParams_2.phonemes[i].phoneme;
vsParams.phonemes[i].phoneme = vsParamsUser.phonemes[i].phoneme;
// }
//}
}
} else {
throw new MessageCustomizableException(
$"Failed to create a voice base. The phoneme is not supported by the VOICEVOX engine.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}",
$"You are confusing phonemes and hiragana.\n{string.Join(" ", phrase.phones.Select(p => p.phoneme))}", new VoicevoxException());
}
if (pitch_slur) {
VoicevoxUtils.AdjustF0ForSlur(VoicevoxUtils.NoteGroupsToVQuery(vNotes.ToArray(), phrase.timeAxis, true), vsParams.f0);
}
return vsParams;
}

/// <summary>
/// Builds the VOICEVOX note list for a phrase, mapping each note's lyric to a
/// form the engine accepts: dictionary entries are converted, plain kana pass
/// through, pauses and unsupported lyrics become empty (rest) lyrics, and
/// vowel-extension ("+") lyrics are kept as-is for downstream handling.
/// </summary>
/// <param name="phrase">The render phrase whose notes are converted.</param>
/// <param name="baseSingerID">NOTE(review): currently unused in this method;
/// kept to preserve the call signature — confirm whether a future use is
/// planned before removing it.</param>
/// <returns>One <c>VoicevoxNote</c> per input note, in score order.</returns>
/// <exception cref="MessageCustomizableException">Wraps any failure while
/// converting the phrase (e.g. unsupported phonemes or invalid input).</exception>
private List<VoicevoxNote> BuildVNotes(RenderPhrase phrase, string baseSingerID) {
    List<VoicevoxNote> vNotes = new List<VoicevoxNote>();
    try {
        // The tone shift is read from the first phone and applied to every
        // note, so it is loop-invariant; hoist it. The guard keeps the
        // original behavior for an empty notes array (phones[0] untouched),
        // and it stays inside the try so a missing phone still surfaces as
        // the custom exception below.
        var shiftTone = phrase.notes.Length > 0 ? phrase.phones[0].toneShift : 0;
        for (int i = 0; i < phrase.notes.Length; i++) {
            var durationMs = phrase.notes[i].durationMs;
            var currentLyric = phrase.notes[i].lyric.Normalize();
            // Multi-token lyrics (e.g. "hint lyric") are matched on the last token.
            var lyricList = currentLyric.Split(" ");
            var token = lyricList[^1];
            if (!VoicevoxUtils.IsSyllableVowelExtensionNote(token)) {
                if (VoicevoxUtils.IsPau(token)) {
                    // Pause/rest notes are sent with an empty lyric.
                    currentLyric = string.Empty;
                } else if (VoicevoxUtils.dic.IsDic(token)) {
                    // Dictionary lyrics are converted to their engine form.
                    currentLyric = VoicevoxUtils.dic.Lyrictodic(token);
                } else if (VoicevoxUtils.phoneme_List.kanas.ContainsKey(token)) {
                    // Plain kana lyrics pass through unchanged.
                    currentLyric = token;
                } else {
                    // Unsupported lyrics become rests rather than failing.
                    currentLyric = string.Empty;
                }
            }
            vNotes.Add(new VoicevoxNote() {
                lyric = currentLyric,
                positionMs = phrase.notes[i].positionMs,
                durationMs = durationMs,
                tone = (int)(phrase.notes[i].tone + shiftTone)
            });
        }
    } catch (Exception e) {
        var phonemeText = string.Join(" ", phrase.phones.Select(p => p.phoneme));
        throw new MessageCustomizableException(
            $"Failed to create a voice base. One or more phonemes may not be supported by the VOICEVOX engine.\n{phonemeText}",
            $"An error occurred while creating a voice base from the current phrase. This may be caused by unsupported phonemes, invalid input, or a mismatch between phonemes and hiragana.\nPhonemes: {phonemeText}\nDetails: {e.Message}",
            new VoicevoxException());
    }

    return vNotes;
}

/// <summary>
/// Returns true when the phrase's phone count equals the number of notes that
/// are not vowel-extension ("+") notes AND every phoneme is a known kana.
/// Used to decide whether phonemes can be mapped 1:1 onto score notes.
/// </summary>
private bool IsPhonemeNoteCountMatch(RenderPhrase phrase) {
    // Count(predicate) avoids the intermediate Where() enumeration.
    int mappableNotes = phrase.notes.Count(note => !VoicevoxUtils.IsSyllableVowelExtensionNote(note.lyric));
    return phrase.phones.Length == mappableNotes
        && phrase.phones.All(p => VoicevoxUtils.phoneme_List.kanas.ContainsKey(p.phoneme));
}

private VoicevoxSynthParams PhonemeToVoicevoxSynthParams(RenderPhrase phrase) {
VoicevoxSynthParams vsParams = new VoicevoxSynthParams();
int headFrames = (int)Math.Round((VoicevoxUtils.headS * VoicevoxUtils.fps), MidpointRounding.AwayFromZero);
Expand Down Expand Up @@ -407,7 +407,7 @@ RenderPitchResult IRenderer.LoadRenderedPitch(RenderPhrase phrase) {
if (singer != null) {

string baseSingerID = VoicevoxUtils.getBaseSingerID(singer);
Copy link

Copilot AI Jan 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The local variable baseSingerID is computed here but never used in the rest of LoadRenderedPitch, which can confuse readers into thinking it affects the pitch-loading logic. Consider removing this unused variable (or using it where intended) to keep the method focused on its actual dependencies.

Suggested change
string baseSingerID = VoicevoxUtils.getBaseSingerID(singer);

Copilot uses AI. Check for mistakes.
VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger/*, true*/);
VoicevoxSynthParams vsParams = PhraseToVoicevoxSynthParams(phrase, phrase.singer as VoicevoxSinger, true);
double frameMs = (1000d / VoicevoxUtils.fps);
int vvTotalFrames = 0;
vsParams.phonemes.ForEach(x => vvTotalFrames += x.frame_length);
Expand Down Expand Up @@ -441,5 +441,23 @@ RenderPitchResult IRenderer.LoadRenderedPitch(RenderPhrase phrase) {
}
return null;
}

// Computes a supplementary cache hash over the inputs that affect synthesis:
// the pre-effect hash, the leading phone's tone / direct flag / volume, and
// the tone-shift values (first phone only in direct mode, otherwise every
// phone participates).
ulong HashPhraseGroups(RenderPhrase phrase) {
    using (var buffer = new MemoryStream()) {
        using (var writer = new BinaryWriter(buffer)) {
            var firstPhone = phrase.phones[0];
            writer.Write(phrase.preEffectHash);
            writer.Write(firstPhone.tone);
            writer.Write(firstPhone.direct);
            if (firstPhone.direct) {
                // Direct mode: only the first phone's shift matters.
                writer.Write(firstPhone.toneShift);
            } else {
                // Otherwise each phone's shift is folded into the hash.
                foreach (var phone in phrase.phones) {
                    writer.Write(phone.toneShift);
                }
            }
            writer.Write(firstPhone.volume);
            return XXH64.DigestOf(buffer.ToArray());
        }
    }
}

}
}
Loading
Loading