diff --git a/Assets/uCosyVoice/Runtime/Inference/FlowRunner.cs b/Assets/uCosyVoice/Runtime/Inference/FlowRunner.cs index 0d02a45..b0c2038 100644 --- a/Assets/uCosyVoice/Runtime/Inference/FlowRunner.cs +++ b/Assets/uCosyVoice/Runtime/Inference/FlowRunner.cs @@ -218,7 +218,7 @@ public Tensor ProcessWithPrompt( var xFinal = xBatch.DownloadToArray(); xBatch.Dispose(); - // Extract only the generated portion (excluding prompt) + // Extract generated portion only (first batch, skip prompt mel frames) var melData = new float[MEL_CHANNELS * generatedMelLen]; for (int c = 0; c < MEL_CHANNELS; c++) { diff --git a/Assets/uCosyVoice/Samples/TTSDemo.cs b/Assets/uCosyVoice/Samples/TTSDemo.cs index c1f1b66..aff0e83 100644 --- a/Assets/uCosyVoice/Samples/TTSDemo.cs +++ b/Assets/uCosyVoice/Samples/TTSDemo.cs @@ -61,9 +61,9 @@ private void Start() _stopButton.gameObject.SetActive(false); } - // Set default text + // Set default text (same as Python comparison test) if (_textInput != null && string.IsNullOrEmpty(_textInput.text)) - _textInput.text = "Hello, this is a test of CosyVoice text to speech synthesis."; + _textInput.text = "Hello, this is a test of CosyVoice."; SetStatus("Click 'Load Models' to initialize TTS."); SetStats(""); @@ -189,18 +189,21 @@ private void DoSynthesizeZeroShot(string text) try { - // Load prompt audio if not cached - if (_promptAudio == null) + // Always reload prompt audio to ensure fresh data + if (_promptAudioClip == null) { - if (_promptAudioClip == null) - { - SetStatus("Error: No prompt audio clip assigned in Inspector."); - return; - } + SetStatus("Error: No prompt audio clip assigned in Inspector."); + return; + } - SetStatus("Processing reference voice..."); - _promptAudio = ExtractAndResampleAudio(_promptAudioClip, 16000); - Debug.Log($"[TTSDemo] Prompt audio: {_promptAudio.Length} samples at 16kHz ({_promptAudio.Length / 16000f:F2}s)"); + SetStatus("Processing reference voice..."); + _promptAudio = ExtractAndResampleAudio(_promptAudioClip, 16000); + Debug.Log($"[TTSDemo] Prompt audio: {_promptAudio.Length} samples at 16kHz ({_promptAudio.Length / 16000f:F2}s)"); + + if (_promptAudio == null || _promptAudio.Length == 0) + { + SetStatus("Error: Failed to extract prompt audio."); + return; } SetStatus("Synthesizing with voice cloning... (UI may freeze)");