Commit a66c8cd

Optimize computation graph memory usage.
1 parent ad0be11 commit a66c8cd

File tree

3 files changed: 26 additions & 42 deletions

sense-voice/csrc/sense-voice-decoder.cc

Lines changed: 11 additions & 6 deletions
@@ -60,7 +60,7 @@ struct ggml_cgraph *sense_voice_build_graph_ctc_decoder(sense_voice_context &ctx
     ggml_cgraph *gf = ggml_new_graph_custom(ctx0, SENSEVOICE_DECODER_MAX_NODES, false);
 
     ggml_tensor *encoder_out = ggml_new_tensor_3d(ctx0, state.encoder_out->type,
-                                                  state.encoder_out->ne[0], state.encoder_out->ne[1],
+                                                  state.encoder_out->ne[0], state.encoder_out->ne[1],
                                                   state.encoder_out->ne[2]);
     ggml_set_name(encoder_out, "encoder_out");
     ggml_set_input(encoder_out);
@@ -125,13 +125,18 @@ bool sense_voice_decode_internal(sense_voice_context &ctx,
         }
         else {
             const int32_t n_logits = argmax_logit->ne[0] * argmax_logit->ne[1];
-            // Get the tensor data into a temporary buffer
-            std::vector<int> temp_buffer(n_logits);
-            ggml_backend_tensor_get(argmax_logit, temp_buffer.data(), 0, sizeof(int) * n_logits);
+            // Use state->ids as temporary buffer if it's large enough, avoiding extra allocation
+            if (state.ids.size() < n_logits) {
+                state.ids.resize(n_logits);
+            }
+            ggml_backend_tensor_get(argmax_logit, state.ids.data(), 0, sizeof(int) * n_logits);
+
             for(int32_t i = 0; i < argmax_logit->ne[1]; i++)
             {
                 int posL = i * argmax_logit->ne[0];
-                state.result_all[state.segmentIDs[i]].tokens = std::vector<int>(temp_buffer.begin() + posL, temp_buffer.begin() + posL + argmax_logit->ne[0]);
+                // Direct assignment without creating temporary vector
+                auto& tokens = state.result_all[state.segmentIDs[i]].tokens;
+                tokens.assign(state.ids.begin() + posL, state.ids.begin() + posL + argmax_logit->ne[0]);
             }
         }
     }
@@ -141,4 +146,4 @@ bool sense_voice_decode_internal(sense_voice_context &ctx,
     state.t_decode_us += ggml_time_us() - t_start_us;
 
     return true;
-}
+}
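
The decoder change above drops the per-call std::vector<int> temp_buffer and instead reuses the longer-lived state.ids buffer, growing it only when it is too small and slicing per-segment tokens out of it with assign(). Below is a minimal, self-contained C++ sketch of that reuse pattern; the DecoderState struct and the fake_backend_tensor_get helper are invented for illustration, and only the resize-if-needed plus assign idiom mirrors the actual diff.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the per-decode scratch storage kept in the state object.
struct DecoderState {
    std::vector<int> ids;                         // reused scratch buffer (like state.ids)
    std::vector<std::vector<int>> segment_tokens; // per-segment token output
};

// Pretend backend read: fills dst with n_logits argmax ids.
static void fake_backend_tensor_get(int *dst, int32_t n_logits) {
    for (int32_t i = 0; i < n_logits; ++i) dst[i] = i % 7;
}

// Grow the scratch buffer only when needed, then slice it per segment
// without building a temporary vector for each segment.
static void decode_tokens(DecoderState &state, int32_t n_per_segment, int32_t n_segments) {
    const int32_t n_logits = n_per_segment * n_segments;
    if ((int32_t) state.ids.size() < n_logits) {
        state.ids.resize(n_logits);               // amortized: no allocation on later calls
    }
    fake_backend_tensor_get(state.ids.data(), n_logits);

    state.segment_tokens.resize(n_segments);
    for (int32_t i = 0; i < n_segments; ++i) {
        const int32_t pos = i * n_per_segment;
        // assign() reuses the destination vector's existing capacity when possible
        state.segment_tokens[i].assign(state.ids.begin() + pos,
                                       state.ids.begin() + pos + n_per_segment);
    }
}

int main() {
    DecoderState state;
    decode_tokens(state, 4, 3);   // first call allocates the scratch buffer
    decode_tokens(state, 4, 2);   // later calls reuse it
    printf("segments: %zu, scratch capacity: %zu\n",
           state.segment_tokens.size(), state.ids.capacity());
    return 0;
}

On repeated calls the scratch buffer keeps its capacity, so the steady state is one backend read plus the per-segment copies, without a fresh heap allocation for the temporary.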

sense-voice/csrc/sense-voice-encoder.cc

Lines changed: 11 additions & 32 deletions
@@ -8,7 +8,7 @@
 #include <map>
 #include <string>
 #include <vector>
-#define SENSE_VOICE_ENCODER_MAX_NODES 8192
+#define SENSE_VOICE_ENCODER_MAX_NODES 6144
 #define WARP_SIZE 32
 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
 // the idea is to represent the original matrix multiplication:
@@ -83,13 +83,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
     const int n_head = hparams.n_encoder_attention_heads;
     auto state = sctx.state;
 
-    struct ggml_tensor *residual = nullptr;
-
-    if (layer.e_norm_w1->ne[0] == layer.e_norm_w2->ne[0]) {
-        residual = ggml_cpy(
-            ctx0, cur,
-            ggml_new_tensor_3d(ctx0, cur->type, cur->ne[0], cur->ne[1], cur->ne[2]));
-    }
+    struct ggml_tensor *residual = cur; // Use original tensor directly instead of copying
 
     {
         // layer norm
@@ -174,22 +168,10 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
         // same in pytorch : F.conv1d(input, weight, bias=None, stride=1, padding=1, dilation=1, groups=n_state)
         struct ggml_tensor * a = layer.e_attn_fsmn_w;
         struct ggml_tensor * b = ggml_cont(ctx0, ggml_transpose(ctx0, V));
-        // Process each batch separately and concatenate results
-        // for (int i = 0; i < b->ne[2]; i++) {
-        //     // View for current batch
-        //     struct ggml_tensor *b_batch = ggml_view_3d(ctx0, b, b->ne[0], b->ne[1], 1, b->nb[1], b->nb[2], i * b->nb[2]);
-        //     struct ggml_tensor *im2col = ggml_im2col(ctx0, a, ggml_reshape_4d(ctx0, b_batch, b_batch->ne[0], 1, b_batch->ne[1], b_batch->ne[2] * b_batch->ne[3]), 1, 0, padding, 0, 1, 0, false, GGML_TYPE_F32);
-        //     struct ggml_tensor * result = ggml_mul_mat(ctx0, a, im2col);
-        //     struct ggml_tensor * fsmn_memory_batch = ggml_reshape_3d(ctx0, result, im2col->ne[1], b_batch->ne[1], b_batch->ne[2]);
-        //     if (fsmn_memory == nullptr) {
-        //         fsmn_memory = fsmn_memory_batch;
-        //     } else {
-        //         fsmn_memory = ggml_concat(ctx0, fsmn_memory, fsmn_memory_batch, 2);
-        //     }
-        // }
+
         struct ggml_tensor * im2col = ggml_im2col(ctx0, a, ggml_reshape_4d(ctx0, b, b->ne[0], 1, b->ne[1] * b->ne[2], b->ne[3]), 1, 0, padding, 0, 1, 0, false, GGML_TYPE_F32);
         im2col = ggml_reshape_4d(ctx0, im2col, im2col->ne[0], im2col->ne[1], im2col->ne[2] / n_batch, n_batch);
-        a = ggml_repeat(ctx0, ggml_cast(ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, a->ne[0], a->ne[1], a->ne[2], n_batch));
+        // a = ggml_repeat(ctx0, ggml_cast(ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, a->ne[0], a->ne[1], a->ne[2], n_batch));
         struct ggml_tensor * result = ggml_mul_mat(ctx0, a, im2col);
         fsmn_memory = ggml_reshape_3d(ctx0, result, im2col->ne[1], im2col->ne[2], im2col->ne[3]);
     }
@@ -227,8 +209,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
     } else{
         // K * Q
         struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K_h, Q_h);
-
-        struct ggml_tensor *KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f);
+        struct ggml_tensor *KQ_soft_max = ggml_soft_max_inplace(ctx0, ggml_scale_inplace(ctx0, KQ, KQscale));
 
 
         ggml_tensor *KQV = ggml_mul_mat(
@@ -250,26 +231,24 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
         }
     }
 
-    residual = ggml_cpy(
-        ctx0, cur,
-        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, cur->ne[0], cur->ne[1], cur->ne[2]));
+    ggml_tensor *residual2 = cur;
     {
         // layer norm after attention
         // cur = ln_0_w*cur + ln_0_b
         cur = ggml_norm(ctx0, cur, hparams.eps);
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.e_norm_w2), layer.e_norm_b2);
+        cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, layer.e_norm_w2), layer.e_norm_b2);
     }
 
     {
         // position-wise feed forward layer
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.e_mlp_w1, cur),
+        cur = ggml_add_inplace(ctx0, ggml_mul_mat(ctx0, layer.e_mlp_w1, cur),
                        layer.e_mlp_b1);
-        cur = ggml_relu(ctx0, cur);
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.e_mlp_w2, cur),
+        cur = ggml_relu_inplace(ctx0, cur);
+        cur = ggml_add_inplace(ctx0, ggml_mul_mat(ctx0, layer.e_mlp_w2, cur),
                        layer.e_mlp_b2);
     }
     // residual after position wise feed forward
-    cur = ggml_add(ctx0, cur, residual);
+    cur = ggml_add_inplace(ctx0, cur, residual2);
     return cur;
 
 }
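
The encoder changes reduce graph memory in two ways: the residual input is kept as a plain reference to the existing tensor (residual = cur, residual2 = cur) instead of being copied into a freshly allocated tensor with ggml_cpy, and element-wise steps (scale, softmax, mul, add, relu) are switched to their _inplace variants so the result can live in the input's buffer rather than a new one. The toy C++ sketch below is not ggml; the Tensor type and the global byte counter are invented purely to illustrate why an in-place chain has the smaller peak footprint.

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy stand-in for a graph tensor: a float buffer plus a global allocation counter.
static size_t g_bytes_allocated = 0;

struct Tensor {
    std::vector<float> data;
    explicit Tensor(size_t n) : data(n) { g_bytes_allocated += n * sizeof(float); }
};

// Out-of-place: every op returns a freshly allocated tensor.
static Tensor relu(const Tensor &x) {
    Tensor y(x.data.size());
    for (size_t i = 0; i < x.data.size(); ++i) y.data[i] = std::max(0.0f, x.data[i]);
    return y;
}

// In-place: the op overwrites its input, so no new buffer is requested.
static Tensor &relu_inplace(Tensor &x) {
    for (float &v : x.data) v = std::max(0.0f, v);
    return x;
}

int main() {
    const size_t n = 1 << 20;            // about 4 MB of floats per tensor

    g_bytes_allocated = 0;
    Tensor a(n);
    Tensor b = relu(relu(a));            // two extra buffers allocated
    printf("out-of-place: %.1f MB\n", g_bytes_allocated / 1e6);

    g_bytes_allocated = 0;
    Tensor c(n);
    relu_inplace(relu_inplace(c));       // no extra buffers
    printf("in-place:     %.1f MB\n", g_bytes_allocated / 1e6);
    return 0;
}

The trade-off is that an in-place node clobbers its input, so it is only safe when nothing else in the graph reads that tensor later, which is why the residual is captured before the in-place layer norm in the hunk above.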

sense-voice/csrc/sense-voice.cc

Lines changed: 4 additions & 4 deletions
@@ -11,7 +11,7 @@
 #include <functional>
 #include <thread>
 
-#define SENSE_VOICE_MAX_NODES 8192
+#define SENSE_VOICE_MAX_NODES 6144
 #define SENSE_VOICE_MAX_DECODERS 8
 #define SENSE_VOICE_CHUNK_SIZE 20
 #define SENSE_VOICE_FEATURES_DIM 560
@@ -697,7 +697,7 @@ int sense_voice_pcm_to_feature_with_state(struct sense_voice_context *ctx,
         state->feature.buffer = nullptr;
     }
     state->feature.tensor = nullptr;
-
+
     // init features
     state->feature.n_len = state->feature.data.size() / (state->feature.n_mel * state->feature.lfr_m);
     state->feature.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
@@ -837,7 +837,7 @@ int sense_voice_batch_pcm_to_feature_with_state(struct sense_voice_context *ctx,
         state->feature.buffer = nullptr;
     }
     state->feature.tensor = nullptr;
-
+
     // init features
     state->feature.n_len = state->feature.data.size() / (state->feature.n_mel * state->feature.lfr_m);
     state->feature.ctx = ggml_init({ggml_tensor_overhead(), nullptr, true});
@@ -916,7 +916,7 @@ int sense_voice_batch_full(struct sense_voice_context *ctx, const sense_voice_fu
 
 int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params &params, std::vector<std::vector<float>> &pcmf32,
                            size_t max_batch_len, size_t max_batch_cnt,
-                           bool use_prefix, bool use_itn)
+                           bool use_prefix, bool use_itn)
 {
     // ctx is still needed here: recreating it would re-read the model, which costs some performance
     // parameters in ctx have to be assigned by the caller; the external parameters come in many shapes and cannot be passed in here
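
Both SENSE_VOICE_MAX_NODES here and SENSE_VOICE_ENCODER_MAX_NODES in the encoder drop from 8192 to 6144. The node budget is what ggml_new_graph_custom (visible in the decoder hunk) uses to size a graph's metadata, so a smaller budget shrinks every graph object the runtime creates. A minimal sketch, assuming the ggml bundled with this repository exposes ggml_graph_overhead_custom, that prints the metadata cost at both budgets:

#include <cstdio>

#include "ggml.h"   // assumes the ggml version vendored by this repository

int main() {
    // Approximate graph metadata overhead (node/leaf bookkeeping) for a given
    // node budget, without gradients; this is the size ggml_new_graph_custom
    // asks the ggml context for.
    const size_t overhead_8192 = ggml_graph_overhead_custom(8192, /*grads=*/false);
    const size_t overhead_6144 = ggml_graph_overhead_custom(6144, /*grads=*/false);

    printf("graph overhead @ 8192 nodes: %zu bytes\n", overhead_8192);
    printf("graph overhead @ 6144 nodes: %zu bytes\n", overhead_6144);
    printf("saved per graph:             %zu bytes\n", overhead_8192 - overhead_6144);
    return 0;
}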
