88#include < map>
99#include < string>
1010#include < vector>
11- #define SENSE_VOICE_ENCODER_MAX_NODES 8192
11+ #define SENSE_VOICE_ENCODER_MAX_NODES 6144
1212#define WARP_SIZE 32
1313// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
1414// the idea is to represent the original matrix multiplication:
@@ -83,13 +83,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
8383 const int n_head = hparams.n_encoder_attention_heads ;
8484 auto state = sctx.state ;
8585
86- struct ggml_tensor *residual = nullptr ;
87-
88- if (layer.e_norm_w1 ->ne [0 ] == layer.e_norm_w2 ->ne [0 ]) {
89- residual = ggml_cpy (
90- ctx0, cur,
91- ggml_new_tensor_3d (ctx0, cur->type , cur->ne [0 ], cur->ne [1 ], cur->ne [2 ]));
92- }
86+ struct ggml_tensor *residual = cur; // Use original tensor directly instead of copying
9387
9488 {
9589 // layer norm
@@ -174,22 +168,10 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
174168 // same in pytorch : F.conv1d(input, weight, bias=None, stride=1, padding=1, dilation=1, groups=n_state)
175169 struct ggml_tensor * a = layer.e_attn_fsmn_w ;
176170 struct ggml_tensor * b = ggml_cont (ctx0, ggml_transpose (ctx0, V));
177- // Process each batch separately and concatenate results
178- // for (int i = 0; i < b->ne[2]; i++) {
179- // // View for current batch
180- // struct ggml_tensor *b_batch = ggml_view_3d(ctx0, b, b->ne[0], b->ne[1], 1, b->nb[1], b->nb[2], i * b->nb[2]);
181- // struct ggml_tensor *im2col = ggml_im2col(ctx0, a, ggml_reshape_4d(ctx0, b_batch, b_batch->ne[0], 1, b_batch->ne[1], b_batch->ne[2] * b_batch->ne[3]), 1, 0, padding, 0, 1, 0, false, GGML_TYPE_F32);
182- // struct ggml_tensor * result = ggml_mul_mat(ctx0, a, im2col);
183- // struct ggml_tensor * fsmn_memory_batch = ggml_reshape_3d(ctx0, result, im2col->ne[1], b_batch->ne[1], b_batch->ne[2]);
184- // if (fsmn_memory == nullptr) {
185- // fsmn_memory = fsmn_memory_batch;
186- // } else {
187- // fsmn_memory = ggml_concat(ctx0, fsmn_memory, fsmn_memory_batch, 2);
188- // }
189- // }
171+
190172 struct ggml_tensor * im2col = ggml_im2col (ctx0, a, ggml_reshape_4d (ctx0, b, b->ne [0 ], 1 , b->ne [1 ] * b->ne [2 ], b->ne [3 ]), 1 , 0 , padding, 0 , 1 , 0 , false , GGML_TYPE_F32);
191173 im2col = ggml_reshape_4d (ctx0, im2col, im2col->ne [0 ], im2col->ne [1 ], im2col->ne [2 ] / n_batch, n_batch);
192- a = ggml_repeat (ctx0, ggml_cast (ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d (ctx0, GGML_TYPE_F16, a->ne [0 ], a->ne [1 ], a->ne [2 ], n_batch));
174+ // a = ggml_repeat(ctx0, ggml_cast(ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, a->ne[0], a->ne[1], a->ne[2], n_batch));
193175 struct ggml_tensor * result = ggml_mul_mat (ctx0, a, im2col);
194176 fsmn_memory = ggml_reshape_3d (ctx0, result, im2col->ne [1 ], im2col->ne [2 ], im2col->ne [3 ]);
195177 }
@@ -227,8 +209,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
227209 } else {
228210 // K * Q
229211 struct ggml_tensor *KQ = ggml_mul_mat (ctx0, K_h, Q_h);
230-
231- struct ggml_tensor *KQ_soft_max = ggml_soft_max_ext (ctx0, KQ, nullptr , KQscale, 0 .0f );
212+ struct ggml_tensor *KQ_soft_max = ggml_soft_max_inplace (ctx0, ggml_scale_inplace (ctx0, KQ, KQscale));
232213
233214
234215 ggml_tensor *KQV = ggml_mul_mat (
@@ -250,26 +231,24 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
250231 }
251232 }
252233
253- residual = ggml_cpy (
254- ctx0, cur,
255- ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, cur->ne [0 ], cur->ne [1 ], cur->ne [2 ]));
234+ ggml_tensor *residual2 = cur;
256235 {
257236 // layer norm after attention
258237 // cur = ln_0_w*cur + ln_0_b
259238 cur = ggml_norm (ctx0, cur, hparams.eps );
260- cur = ggml_add (ctx0, ggml_mul (ctx0, cur, layer.e_norm_w2 ), layer.e_norm_b2 );
239+ cur = ggml_add_inplace (ctx0, ggml_mul_inplace (ctx0, cur, layer.e_norm_w2 ), layer.e_norm_b2 );
261240 }
262241
263242 {
264243 // position-wise feed forward layer
265- cur = ggml_add (ctx0, ggml_mul_mat (ctx0, layer.e_mlp_w1 , cur),
244+ cur = ggml_add_inplace (ctx0, ggml_mul_mat (ctx0, layer.e_mlp_w1 , cur),
266245 layer.e_mlp_b1 );
267- cur = ggml_relu (ctx0, cur);
268- cur = ggml_add (ctx0, ggml_mul_mat (ctx0, layer.e_mlp_w2 , cur),
246+ cur = ggml_relu_inplace (ctx0, cur);
247+ cur = ggml_add_inplace (ctx0, ggml_mul_mat (ctx0, layer.e_mlp_w2 , cur),
269248 layer.e_mlp_b2 );
270249 }
271250 // residual after position wise feed forward
272- cur = ggml_add (ctx0, cur, residual );
251+ cur = ggml_add_inplace (ctx0, cur, residual2 );
273252 return cur;
274253
275254}
0 commit comments