From 80c7dfae9cf5f95b64e5133eb99559207c7b6f4b Mon Sep 17 00:00:00 2001 From: Muhammad Kashif Khan Date: Mon, 9 Sep 2024 14:54:30 +0500 Subject: [PATCH 1/2] update the deepgram package --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 595b0a29..9f9b062f 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ "author": "Charlie Weems", "license": "MIT", "dependencies": { - "@deepgram/sdk": "^3.3.4", + "@deepgram/sdk": "^3.6.0", "colors": "^1.4.0", "dotenv": "^16.3.1", "express": "^4.19.2", From d88ad42b4aa82a35107611636d85cb27080d6239 Mon Sep 17 00:00:00 2001 From: Muhammad Kashif Khan Date: Mon, 9 Sep 2024 14:59:57 +0500 Subject: [PATCH 2/2] fix issue if agent and user are quiet too long due to background noise --- app.js | 14 ++++++++++++-- services/transcription-service.js | 13 ++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/app.js b/app.js index c862debc..ac13003f 100644 --- a/app.js +++ b/app.js @@ -36,6 +36,7 @@ app.ws('/connection', (ws) => { // Filled in from start message let streamSid; let callSid; + let globalInterval; const gptService = new GptService(); const streamService = new StreamService(ws); @@ -71,9 +72,9 @@ app.ws('/connection', (ws) => { } }); - transcriptionService.on('utterance', async (text) => { + transcriptionService.on('utterance', async ({ text, duration, start }) => { // This is a bit of a hack to filter out empty utterances - if(marks.length > 0 && text?.length > 5) { + if(marks.length > 0 && text?.trim()?.length) { console.log('Twilio -> Interruption, Clearing stream'.red); ws.send( JSON.stringify({ @@ -81,18 +82,26 @@ app.ws('/connection', (ws) => { event: 'clear', }) ); + } else if (duration > 4 && !text?.trim()?.length && !marks.length) { + globalInterval = setInterval(() => { + console.log('Interval running...'); + }, 2000); + console.log(`durration=${duration}, start=${start}`.red); + ttsService.generate({partialResponseIndex: null, 
partialResponse: 'sorry! are you there i dont listen anything?'}, ++interactionCount); } }); transcriptionService.on('transcription', async (text) => { if (!text) { return; } console.log(`Interaction ${interactionCount} – STT -> GPT: ${text}`.yellow); + clearInterval(globalInterval); gptService.completion(text, interactionCount); interactionCount += 1; }); gptService.on('gptreply', async (gptReply, icount) => { console.log(`Interaction ${icount}: GPT -> TTS: ${gptReply.partialResponse}`.green ); + clearInterval(globalInterval); ttsService.generate(gptReply, icount); }); @@ -104,6 +113,7 @@ app.ws('/connection', (ws) => { streamService.on('audiosent', (markLabel) => { marks.push(markLabel); + clearInterval(globalInterval); }); } catch (err) { console.log(err); diff --git a/services/transcription-service.js b/services/transcription-service.js index 578fd80b..4cab33a9 100644 --- a/services/transcription-service.js +++ b/services/transcription-service.js @@ -15,7 +15,9 @@ class TranscriptionService extends EventEmitter { punctuate: true, interim_results: true, endpointing: 200, - utterance_end_ms: 1000 + utterance_end_ms: 1000, + smart_format: true, + vad_events: true, }); this.finalResult = ''; @@ -44,7 +46,7 @@ class TranscriptionService extends EventEmitter { // console.log(text, "is_final: ", transcription?.is_final, "speech_final: ", transcription.speech_final); // if is_final that means that this chunk of the transcription is accurate and we need to add it to the finalResult if (transcriptionEvent.is_final === true && text.trim().length > 0) { - this.finalResult += ` ${text}`; + this.finalResult += ` ${text.trim()}`; // if speech_final and is_final that means this text is accurate and it's a natural pause in the speakers speech. 
We need to send this to the assistant for processing if (transcriptionEvent.speech_final === true) { this.speechFinal = true; // this will prevent a utterance end which shows up after speechFinal from sending another response @@ -55,7 +57,12 @@ class TranscriptionService extends EventEmitter { this.speechFinal = false; } } else { - this.emit('utterance', text); + console.log(`STT -> Deepgram transcription: ${text}`.yellow); + this.emit('utterance', { + text: text, + duration: transcriptionEvent.duration, + start: transcriptionEvent.start, + }); } });