1+ import * as tf from '@tensorflow/tfjs'
2+
13import { CommandModels , SpeechContext , SpeechProcessor } from '../types'
24import { SpeechConfig , SpeechEvent , SpeechEventType } from '../../client/types'
35
46import RingBuffer from '../RingBuffer'
5- import type { Tensor } from '@tensorflow/tfjs'
67
78const defaultConfig = {
89 melLength : 10 ,
@@ -74,9 +75,9 @@ export default class WakewordTrigger implements SpeechProcessor {
7475 private models : CommandModels
7576 private hopSamples : number
7677 private sampleWindow = new RingBuffer < number > ( 0 )
77- private encodeWindow = new RingBuffer < Tensor > ( 0 )
78- private encodeState = self . tf . zeros ( [ 1 ] )
79- private frameWindow = new RingBuffer < Tensor > ( 0 )
78+ private encodeWindow = new RingBuffer < tf . Tensor > ( 0 )
79+ private encodeState = tf . zeros ( [ 1 ] )
80+ private frameWindow = new RingBuffer < tf . Tensor > ( 0 )
8081 private vadActive = false
8182
8283 static async create ( config : SpeechConfig ) {
@@ -88,7 +89,6 @@ export default class WakewordTrigger implements SpeechProcessor {
8889 }
8990
9091 constructor ( models : CommandModels , options : WakewordTriggerConfig ) {
91- const tf = self . tf
9292 const config = ( this . config = { ...defaultConfig , ...options } )
9393 this . models = models
9494
@@ -101,7 +101,7 @@ export default class WakewordTrigger implements SpeechProcessor {
101101 this . hopSamples = config . hopLength * ( config . sampleRate / 1000 )
102102 this . sampleWindow = new RingBuffer < number > ( config . fftWidth )
103103 const melSamples = ( config . melLength * config . sampleRate ) / 1000 / this . hopSamples
104- this . frameWindow = new RingBuffer < Tensor > ( melSamples )
104+ this . frameWindow = new RingBuffer < tf . Tensor > ( melSamples )
105105 const frameFill = tf . zeros ( [ config . melWidth ] )
106106 this . frameWindow . fill ( frameFill )
107107
@@ -110,7 +110,7 @@ export default class WakewordTrigger implements SpeechProcessor {
110110 const encodeLength = detectIn [ 1 ]
111111 const encodeWidth = detectIn [ detectIn . length - 1 ]
112112
113- this . encodeWindow = new RingBuffer < Tensor > ( encodeLength )
113+ this . encodeWindow = new RingBuffer < tf . Tensor > ( encodeLength )
114114 const encodeFill = tf . fill ( [ encodeWidth ] , - 1.0 )
115115 this . encodeWindow . fill ( encodeFill )
116116 } else {
@@ -126,7 +126,6 @@ export default class WakewordTrigger implements SpeechProcessor {
126126 }
127127
128128 static async loadModels ( baseUrl : string ) : Promise < CommandModels > {
129- const tf = self . tf
130129 return Promise . all ( [
131130 tf . loadGraphModel ( `${ baseUrl } /filter/model.json` ) ,
132131 tf . loadGraphModel ( `${ baseUrl } /encode/model.json` ) ,
@@ -161,32 +160,32 @@ export default class WakewordTrigger implements SpeechProcessor {
161160 }
162161 }
163162
164- async filter ( context : SpeechContext ) {
163+ filter ( context : SpeechContext ) {
165164 const frame = this . sampleWindow . toArray ( )
166- const filtered = this . models . filter . execute ( [ self . tf . stack ( frame ) ] ) as Tensor
165+ const filtered = this . models . filter . execute ( tf . stack ( frame ) )
167166 this . frameWindow . rewind ( ) . seek ( 1 )
168- this . frameWindow . write ( filtered )
169- await this . encode ( context )
167+ this . frameWindow . write ( Array . isArray ( filtered ) ? filtered [ 0 ] : filtered )
168+ return this . encode ( context )
170169 }
171170
172171 async encode ( context : SpeechContext ) {
173- const tf = self . tf
174172 const filtered = this . frameWindow . toArray ( )
175173 const stacked = tf . stack ( filtered )
176174 const input = [ tf . expandDims ( stacked ) , this . encodeState ]
177- const result = ( await this . models . encode . executeAsync ( input ) ) as Tensor [ ]
175+ const result = ( await this . models . encode . executeAsync ( input ) ) as tf . Tensor [ ]
176+ console . log ( JSON . stringify ( result ) )
178177 this . encodeWindow . rewind ( ) . seek ( 1 )
179178 this . encodeWindow . write ( tf . squeeze ( result [ 0 ] ) )
180179 this . encodeState = result [ 1 ]
181- await this . detect ( context )
180+ return this . detect ( context )
182181 }
183182
184- async detect ( context : SpeechContext ) {
185- const tf = self . tf
183+ detect ( context : SpeechContext ) {
186184 const encoded = this . encodeWindow . toArray ( )
187185 const stacked = tf . stack ( encoded )
188186 const input = tf . expandDims ( stacked )
189- const detected = this . models . detect . execute ( [ input ] ) as Tensor
187+ const result = this . models . detect . execute ( input )
188+ const detected = Array . isArray ( result ) ? result [ 0 ] : result
190189 const confidence = tf . max ( detected ) . dataSync ( ) [ 0 ]
191190
192191 // console.log(`wakeword: ${confidence.toFixed(6)}`)
@@ -205,7 +204,6 @@ export default class WakewordTrigger implements SpeechProcessor {
205204 }
206205
207206 reset ( ) {
208- const tf = self . tf
209207 this . sampleWindow . reset ( )
210208
211209 const frameFill = tf . zeros ( [ this . config . melWidth ] )
0 commit comments