diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index c8f1b5ec90f..330c339a4f3 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -3,6 +3,241 @@
 #define CHUNK_SIZE 64
 
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+struct PerformanceMetrics {
+private:
+    // Per-operation start times, so nested timers (e.g. "build_dn_chunking"
+    // running inside "attn_calc_linear") do not clobber each other.
+    std::unordered_map<std::string, std::chrono::high_resolution_clock::time_point> start_times;
+    std::unordered_map<std::string, double> op_times;  // Total time spent on each operation (ms)
+    std::unordered_map<std::string, size_t> op_counts; // Number of times each operation was called
+    std::vector<std::string> operation_history;        // Keeps track of recent operations
+
+public:
+    // Start timing an operation
+    void start_op(const std::string & op_name) {
+        start_times[op_name] = std::chrono::high_resolution_clock::now();
+        operation_history.push_back(op_name);
+    }
+
+    // Stop timing an operation
+    void end_op(const std::string & op_name) {
+        auto it = start_times.find(op_name);
+        if (it == start_times.end()) {
+            return; // end_op without a matching start_op
+        }
+        auto end_time    = std::chrono::high_resolution_clock::now();
+        auto duration    = std::chrono::duration_cast<std::chrono::microseconds>(end_time - it->second);
+        double duration_ms = duration.count() / 1000.0;
+
+        op_times[op_name]  += duration_ms;
+        op_counts[op_name] += 1;
+    }
+
+    // Get the average time of a specific operation
+    double get_avg_time(const std::string & op_name) const {
+        auto it = op_counts.find(op_name);
+        if (it != op_counts.end() && it->second > 0) {
+            return op_times.at(op_name) / it->second;
+        }
+        return 0.0;
+    }
+
+    // Get the total time of all operations
+    double get_total_time() const {
+        double total = 0.0;
+        for (const auto & pair : op_times) {
+            total += pair.second;
+        }
+        return total;
+    }
+
+    // Print performance report to console
+    void print_report() const {
+        print_report(std::cout);
+    }
+
+    // Print performance report to specified output stream
+    void print_report(std::ostream & out) const {
+        double total_time = get_total_time();
+
+        out << "\n" << std::string(60, '=') << "\n";
+        out << "                    Performance Report\n";
+        out << std::string(60, '=') << "\n";
+        out << std::left;
+        out << std::setw(25) << "Operation Name"
+            << std::setw(12) << "Total(ms)"
+            << std::setw(10) << "Calls"
+            << std::setw(15) << "Avg(ms)"
+            << std::setw(8)  << "Pct(%)" << "\n";
+        out << std::string(60, '-') << "\n";
+
+        // Create a list of operations and sort them by total time
+        std::vector<std::pair<std::string, std::pair<double, size_t>>> ops;
+        for (const auto & pair : op_times) {
+            ops.emplace_back(pair.first,
+                             std::make_pair(pair.second, op_counts.at(pair.first)));
+        }
+
+        std::sort(ops.begin(), ops.end(),
+                  [](const auto & a, const auto & b) {
+                      return a.second.first > b.second.first;
+                  });
+
+        for (const auto & op : ops) {
+            const std::string & name = op.first;
+            double total      = op.second.first;
+            size_t count      = op.second.second;
+            double avg        = total / count;
+            double percentage = total_time > 0 ? (total / total_time) * 100.0 : 0.0;
+
+            out << std::setw(25) << name
+                << std::setw(12) << std::fixed << std::setprecision(3) << total
+                << std::setw(10) << count
+                << std::setw(15) << std::setprecision(3) << avg
+                << std::setw(8)  << std::setprecision(2) << percentage << "\n";
+        }
+
+        out << std::string(60, '-') << "\n";
+        out << std::setw(25) << "Total"
+            << std::setw(12) << std::fixed << std::setprecision(3) << total_time
+            << std::setw(33) << "100.00" << "\n";
+        out << std::string(60, '=') << "\n";
+
+        // Identify the heaviest operations
+        if (!ops.empty()) {
+            out << "\nPerformance Bottleneck Analysis:\n";
+            out << "- Most time-consuming operation: " << ops[0].first
+                << " (" << std::fixed << std::setprecision(3) << ops[0].second.first << " ms)\n";
+            out << "- Longest average time: ";
+
+            double max_avg = 0.0;
+            std::string max_avg_op;
+            for (const auto & op : ops) {
+                double avg = op.second.first / op.second.second;
+                if (avg > max_avg) {
+                    max_avg    = avg;
+                    max_avg_op = op.first;
+                }
+            }
+            out << max_avg_op << " (" << std::fixed << std::setprecision(3) << max_avg << " ms)\n";
+        }
+
+        out << std::right << std::endl;
+    }
+
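+    // The std::ostream overload above also allows routing the report to a
+    // file rather than stdout, e.g. (a usage sketch, not part of this patch):
+    //
+    //     std::ofstream log("perf.log");
+    //     perf_metrics.print_report(log);
+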
+    // Export performance data to CSV
+    void export_csv(const std::string & filename) const {
+        std::ofstream file(filename);
+        if (!file.is_open()) {
+            std::cerr << "Unable to create file: " << filename << std::endl;
+            return;
+        }
+
+        file << "Operation,Total(ms),Calls,Avg(ms),Pct(%)\n";
+        double total_time = get_total_time();
+
+        std::vector<std::pair<std::string, std::pair<double, size_t>>> ops;
+        for (const auto & pair : op_times) {
+            ops.emplace_back(pair.first,
+                             std::make_pair(pair.second, op_counts.at(pair.first)));
+        }
+
+        std::sort(ops.begin(), ops.end(),
+                  [](const auto & a, const auto & b) {
+                      return a.second.first > b.second.first;
+                  });
+
+        for (const auto & op : ops) {
+            const std::string & name = op.first;
+            double total      = op.second.first;
+            size_t count      = op.second.second;
+            double avg        = total / count;
+            double percentage = total_time > 0 ? (total / total_time) * 100.0 : 0.0;
+
+            file << name << ","
+                 << std::fixed << std::setprecision(3) << total << ","
+                 << count << ","
+                 << std::setprecision(3) << avg << ","
+                 << std::setprecision(2) << percentage << "\n";
+        }
+
+        file.close();
+        std::cout << "Performance data exported to: " << filename << std::endl;
+    }
+
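+    // Note: print_realtime_summary() below is defined but never called in
+    // this patch. One plausible call site (an assumption, not upstream code)
+    // is once per layer iteration; the 5-second check keeps it rate-limited:
+    //
+    //     perf_metrics.print_realtime_summary();
+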
+    // Print a realtime summary every 5 seconds
+    void print_realtime_summary() const {
+        static auto last_print = std::chrono::high_resolution_clock::now();
+
+        auto now       = std::chrono::high_resolution_clock::now();
+        auto time_diff = std::chrono::duration_cast<std::chrono::seconds>(now - last_print);
+
+        if (time_diff.count() >= 5) { // Print every 5 seconds
+            std::cout << "\n[Performance Monitor] Realtime Summary (updated every 5s):\n";
+            std::cout << "Total Time: " << std::fixed << std::setprecision(2)
+                      << get_total_time() << " ms\n";
+            std::cout << "Number of operations: " << op_times.size() << "\n";
+
+            std::cout << "Recent operations: ";
+            size_t recent_count = std::min(operation_history.size(), size_t(5));
+            for (size_t i = operation_history.size() - recent_count; i < operation_history.size(); ++i) {
+                std::cout << operation_history[i];
+                if (i < operation_history.size() - 1) std::cout << " -> ";
+            }
+            std::cout << "\n" << std::string(50, '-') << "\n";
+
+            last_print = now;
+        }
+    }
+
+    // Reset all performance data
+    void reset() {
+        start_times.clear();
+        op_times.clear();
+        op_counts.clear();
+        operation_history.clear();
+    }
+
+    // Get raw performance data for further analysis
+    std::unordered_map<std::string, double> get_performance_data() const {
+        return op_times;
+    }
+
+    std::unordered_map<std::string, size_t> get_operation_counts() const {
+        return op_counts;
+    }
+};
+
+// Global performance monitor
+static PerformanceMetrics perf_metrics;
+
+// Macros for convenience
+#define PERF_START(op_name) perf_metrics.start_op(op_name)
+#define PERF_END(op_name)   perf_metrics.end_op(op_name)
+
+/*
+Usage Example:
+
+    PERF_START("model_init");
+    // Model initialization code
+    PERF_END("model_init");
+
+    PERF_START("delta_net_chunking");
+    ggml_tensor * result = build_delta_net_chunking(q, k, v, g, beta, state, causal_mask, identity, il);
+    PERF_END("delta_net_chunking");
+
+    PERF_START("attention_computation");
+    // Attention computation
+    PERF_END("attention_computation");
+
+    // Print report
+    perf_metrics.print_report();
+    perf_metrics.export_csv("performance_report.csv");
+*/
+
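+// A possible refinement (a sketch, not part of the original patch): an RAII
+// guard built on start_op()/end_op(), so a timed span cannot be left open by
+// an early return and START/END names cannot drift apart. PerfScope and
+// PERF_SCOPE are hypothetical names introduced here for illustration only.
+//
+// struct PerfScope {
+//     const char * name;
+//     explicit PerfScope(const char * n) : name(n) { perf_metrics.start_op(name); }
+//     ~PerfScope() { perf_metrics.end_op(name); }
+// };
+// #define PERF_SCOPE(op_name) PerfScope perf_scope_guard_(op_name)
+//
+// Usage: { PERF_SCOPE("ffn_calc"); cur = build_layer_ffn(attn_post_norm, il); }
+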
 
 llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context_mamba(params), model(model) {
     ggml_tensor * cur;
@@ -16,30 +251,41 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_tensor * inp_pos = build_inp_pos();
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
+    ggml_tensor * causal_mask = ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f), GGML_TRI_TYPE_LOWER);
-    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
+    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
+
+    ggml_build_forward_expand(gf, causal_mask);
+    ggml_build_forward_expand(gf, identity);
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
+        // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
+            PERF_START("attn_calc_linear");
             cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
+            PERF_END("attn_calc_linear");
         } else {
             // Full attention layer
+            PERF_START("attn_calc_full");
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
+            PERF_END("attn_calc_full");
         }
 
         if (il == n_layer - 1 && inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -56,8 +302,10 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
         cb(attn_post_norm, "attn_post_norm", il);
 
+        PERF_START("ffn_calc");
         // FFN layer (MoE or dense) - without residual connection
         cur = build_layer_ffn(attn_post_norm, il);
+        PERF_END("ffn_calc");
         cb(cur, "ffn_out", il);
 
         // Residual connection for FFN - add to the tensor from before post_attention_layernorm
@@ -67,6 +315,8 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         // Input for next layer
         inpL = cur;
     }
+
+    cur = inpL;
 
     // Final norm
@@ -81,7 +331,10 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
+    PERF_START("fwd_expand");
     ggml_build_forward_expand(gf, cur);
+    PERF_END("fwd_expand");
+
+    perf_metrics.print_report();
 }
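+// Note: the perf_metrics.print_report() call above fires on every graph
+// build, i.e. once per ubatch. A quieter variant (a sketch, assuming an
+// environment switch is acceptable; QWEN3NEXT_PERF is a hypothetical name,
+// and <cstdlib> is needed for std::getenv):
+//
+//     if (std::getenv("QWEN3NEXT_PERF") != nullptr) {
+//         perf_metrics.print_report();
+//         perf_metrics.export_csv("performance_report.csv");
+//     }
+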
 
 ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
@@ -94,6 +347,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
     ggml_tensor * causal_mask,
     ggml_tensor * identity,
     int il) {
+    PERF_START("build_dn_chunking");
     GGML_ASSERT(ggml_is_contiguous(q));
     GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
@@ -358,6 +612,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     ggml_tensor * flat_state = ggml_cont_1d(ctx0, new_state, S_v * S_v * H_v * n_seqs);
 
+    PERF_END("build_dn_chunking");
     return ggml_concat(ctx0, flat_output, flat_state, 0);
 }
 
 ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
@@ -371,6 +626,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     ggml_tensor * causal_mask,
     ggml_tensor * identity,
     int il) {
+    PERF_START("build_dn_recur");
     GGML_ASSERT(ggml_is_contiguous(q));
     GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
@@ -444,8 +700,9 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
     ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
 
+    PERF_START("cumsum_dn");
     ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
+    PERF_END("cumsum_dn");
 
     cb(k_beta, "k_beta", il);
     cb(v_beta, "v_beta", il);
     cb(g_cumsum, "g_cumsum", il);
@@ -458,9 +715,12 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     //     ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
     //     n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
     // Don't need this, this one will get auto-broadcast
+    PERF_START("broadcast");
     ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
+    PERF_END("broadcast");
+
+    PERF_START("decay_mask");
+    // Each decay-mask entry is exp() of a difference of cumulative gate sums,
+    // zeroed above the diagonal so only past tokens contribute.
     ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
 
     // Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
@@ -469,7 +729,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     decay_mask = ggml_exp(ctx0, decay_mask);
     // Apply lower triangular mask again to ensure only lower triangular values remain
     decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
+    PERF_END("decay_mask");
     cb(decay_mask, "decay_mask", il);
 
     // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
@@ -492,18 +752,24 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
     ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
 
+    PERF_START("solve_tri_recur");
     ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
+    PERF_END("solve_tri_recur");
     attn = ggml_mul(ctx0, lin_solve, causal_mask);
     attn = ggml_add(ctx0, attn, identity);
 
     // value = attn @ v_beta
+    PERF_START("value_mul_mat");
     v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
+    PERF_END("value_mul_mat");
+
     cb(v, "value_beta", il);
 
     // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
     ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
+    PERF_START("ggml_exp");
     ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
+    PERF_END("ggml_exp");
 
     cb(gexp, "g_cum_exp", il);
 
@@ -607,6 +873,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
 
     ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
 
+    PERF_END("build_dn_recur");
     return ggml_concat(ctx0, flat_output, flat_state, 0);
 }
 
@@ -862,7 +1129,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(conv_states_all, "conv_states_updated", il);
 
     // Apply SSM convolution
+    PERF_START("ssm_conv");
     ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
+    PERF_END("ssm_conv");
     cb(conv_output_proper, "conv_output_raw", il);
 
     conv_output_proper = ggml_cont(ctx0, ggml_transpose(ctx0, conv_output_proper));
@@ -926,9 +1195,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(v_conv, "v_conv_predelta", il);
 
     // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
+    PERF_START("build_delta_net");
     ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
         build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
         build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
+    PERF_END("build_delta_net");
     cb(attn_out, "attn_out", il);
 
     // The tensors were concatenated 1d, so we need to extract them 1d as well
@@ -948,11 +1219,19 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(state_1d, "state_1d", il);
 
     // Update the recurrent states
-    ggml_build_forward_expand(gf,
-        ggml_cpy(ctx0, state_1d,
-            ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
-                kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
-
+    PERF_START("fwd_exp_upd_recur_sts");
+    ggml_tensor * state_view = ggml_view_1d(
+        ctx0,
+        ssm_states_all,
+        hparams.n_embd_s() * n_seqs,
+        kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all));
+    // The copy into the view is what actually persists the updated state;
+    // expanding the bare view alone would leave ssm_states_all untouched.
+    ggml_build_forward_expand(gf, ggml_cpy(ctx0, state_1d, state_view));
+    PERF_END("fwd_exp_upd_recur_sts");
 
     GGML_ASSERT(ggml_nelements(attn_out_1d) + ggml_nelements(state_1d) == ggml_nelements(attn_out));
 
     // Reshape both attn_out_final and z to 2D tensors for normalization
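+    // Layout recap (explanatory note, inferred from the code above): each
+    // build_delta_net_* helper returns one flat tensor laid out as
+    //
+    //     [ flat_output | flat_state (S_v * S_v * H_v * n_seqs elements) ]
+    //
+    // attn_out_1d and state_1d are 1-D views into those two halves, and the
+    // GGML_ASSERT above checks that the split accounts for every element.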