From 4832cda1b28d09026840255e6c4dfa9e8796a789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 09:33:50 -0800 Subject: [PATCH 1/2] Add profiling instrumentation for NAM building blocks Adds a profiling framework (NAM/profiling.h, NAM/profiling.cpp) with NAM_PROFILE_START()/NAM_PROFILE_ADD() macros and 14 timing categories. Supports both desktop (std::chrono) and ARM Cortex-M7 (DWT cycle counter) backends. Profiling is compile-time gated via -DNAM_PROFILING. Instruments wavenet _Layer::Process() and _LayerArray::ProcessInner() with per-category timing, and adds profiling reset/print calls to the benchmodel tool. Co-Authored-By: Claude Opus 4.6 --- NAM/conv1d.cpp | 4 ++ NAM/dsp.cpp | 4 ++ NAM/film.h | 4 ++ NAM/profiling.cpp | 47 +++++++++++++ NAM/profiling.h | 153 +++++++++++++++++++++++++++++++++++++++++++ NAM/wavenet.cpp | 23 +++++++ tools/benchmodel.cpp | 7 ++ 7 files changed, 242 insertions(+) create mode 100644 NAM/profiling.cpp create mode 100644 NAM/profiling.h diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 9bbbc02..d440f0c 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -1,4 +1,5 @@ #include "conv1d.h" +#include "profiling.h" #include namespace nam @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize) void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to avoid double-counting when Conv1D is called from within profiled blocks. + // Write input to ring buffer _input_buffer.Write(input, num_frames); diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 05dab09..b644af3 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -8,6 +8,7 @@ #include #include "dsp.h" +#include "profiling.h" #include "registry.h" #define tanh_impl_ std::tanh @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { + // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp) + // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel) + // rather than generic conv1x1. assert(num_frames <= _output.cols()); if (this->_is_depthwise) diff --git a/NAM/film.h b/NAM/film.h index f0f86fb..eeb750a 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -81,9 +81,13 @@ class FiLM assert(num_frames <= condition.cols()); assert(num_frames <= _output.cols()); + // Conv1x1 to compute scale/shift from condition _cond_to_scale_shift.process_(condition, num_frames); const auto& scale_shift = _cond_to_scale_shift.GetOutput(); + // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin) + // rather than tracked separately, to avoid double-counting. + const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames); if (_do_shift) { diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp new file mode 100644 index 0000000..65d430e --- /dev/null +++ b/NAM/profiling.cpp @@ -0,0 +1,47 @@ +#include "profiling.h" + +#ifdef NAM_PROFILING + +#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7) +// ARM Cortex-M7: Use DWT cycle counter for precise timing +#include "stm32h7xx.h" + +namespace nam { +namespace profiling { + +Timings g_timings; + +// CPU frequency in MHz (Daisy runs at 480 MHz) +static constexpr uint32_t CPU_FREQ_MHZ = 480; + +uint32_t get_time_us() { + // DWT->CYCCNT gives cycle count + // Divide by CPU_FREQ_MHZ to get microseconds + return DWT->CYCCNT / CPU_FREQ_MHZ; +} + +} // namespace profiling +} // namespace nam + +#else +// Non-ARM: Use std::chrono for timing (for testing on desktop) +#include + +namespace nam { +namespace profiling { + +Timings g_timings; + +uint32_t get_time_us() { + using namespace std::chrono; + static auto start = high_resolution_clock::now(); + auto now = high_resolution_clock::now(); + return (uint32_t)duration_cast(now - start).count(); +} + +} // namespace profiling +} // namespace nam + +#endif // ARM check + +#endif // NAM_PROFILING diff --git a/NAM/profiling.h b/NAM/profiling.h new file mode 100644 index 0000000..71031fe --- /dev/null +++ b/NAM/profiling.h @@ -0,0 +1,153 @@ +#pragma once + +// Comprehensive profiling for NAM building blocks +// Enable with -DNAM_PROFILING +// +// Usage: +// 1. Call nam::profiling::reset() before benchmark +// 2. Run model processing +// 3. Call nam::profiling::print_results() to display breakdown +// +// Categories cover all WaveNet operations including FiLM modulation. + +#ifdef NAM_PROFILING + +#include +#include + +namespace nam { +namespace profiling { + +// Timing accumulators (in microseconds) +struct Timings { + // Dilated convolution (Conv1D) + uint32_t conv1d = 0; + + // Pointwise convolutions (Conv1x1 variants) + uint32_t input_mixin = 0; // Input mixing Conv1x1 + uint32_t layer1x1 = 0; // Layer 1x1 (residual projection) + uint32_t head1x1 = 0; // Head 1x1 (skip connection projection) + uint32_t rechannel = 0; // Rechannel Conv1x1 (input/output) + uint32_t conv1x1 = 0; // Other Conv1x1 (catch-all for non-WaveNet uses) + + // Activation + uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.) + + // FiLM modulation + uint32_t film = 0; // Feature-wise Linear Modulation (scale/shift) + + // Memory operations + uint32_t copies = 0; // Memory copies and additions + uint32_t setzero = 0; // setZero() calls + uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance) + + // Conditioning + uint32_t condition = 0; // Condition DSP processing + + // LSTM (for LSTM models) + uint32_t lstm = 0; // LSTM cell computations + + // Catch-all + uint32_t other = 0; // Everything else + + void reset() { + conv1d = 0; + input_mixin = 0; + layer1x1 = 0; + head1x1 = 0; + rechannel = 0; + conv1x1 = 0; + activation = 0; + film = 0; + copies = 0; + setzero = 0; + ringbuf = 0; + condition = 0; + lstm = 0; + other = 0; + } + + uint32_t total() const { + return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other; + } +}; + +// Global timing accumulator +extern Timings g_timings; + +// Get current time in microseconds (platform-specific) +uint32_t get_time_us(); + +// Reset profiling counters +inline void reset() { g_timings.reset(); } + +// Print profiling results to stdout +inline void print_results() { + const auto& t = g_timings; + uint32_t total = t.total(); + + printf("\nProfiling breakdown:\n"); + printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%"); + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + + auto print_row = [total](const char* name, uint32_t us) { + if (us > 0 || total == 0) { + uint32_t pct = total > 0 ? (us * 100 / total) : 0; + printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct); + } + }; + + print_row("Conv1D", t.conv1d); + print_row("InputMixin", t.input_mixin); + print_row("Layer1x1", t.layer1x1); + print_row("Head1x1", t.head1x1); + print_row("Rechannel", t.rechannel); + print_row("Conv1x1", t.conv1x1); + print_row("Activation", t.activation); + print_row("FiLM", t.film); + print_row("Copies", t.copies); + print_row("SetZero", t.setzero); + print_row("RingBuf", t.ringbuf); + print_row("Condition", t.condition); + print_row("LSTM", t.lstm); + print_row("Other", t.other); + + printf("%-12s %8s %6s\n", "--------", "--------", "----"); + printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%"); +} + +// Helper macros for timing sections +// Usage: +// NAM_PROFILE_START(); +// // ... code to profile ... +// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer + +#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us() +#define NAM_PROFILE_ADD(category) do { \ + uint32_t _prof_now = nam::profiling::get_time_us(); \ + nam::profiling::g_timings.category += (_prof_now - _prof_start); \ + _prof_start = _prof_now; \ +} while(0) + +// Variant that doesn't reset the timer (for one-shot measurements) +#define NAM_PROFILE_ADD_NORESTART(category) \ + nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start) + +} // namespace profiling +} // namespace nam + +#else // NAM_PROFILING not defined + +// No-op macros when profiling is disabled +#define NAM_PROFILE_START() ((void)0) +#define NAM_PROFILE_ADD(category) ((void)0) +#define NAM_PROFILE_ADD_NORESTART(category) ((void)0) + +namespace nam { +namespace profiling { + inline void reset() {} + inline void print_results() {} +} // namespace profiling +} // namespace nam + +#endif // NAM_PROFILING diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 7d9b5d0..6112169 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -6,6 +6,7 @@ #include #include "get_dsp.h" +#include "profiling.h" #include "registry.h" #include "wavenet.h" @@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector::iterator& weights) void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels // Step 1: input convolutions @@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& conv_output = this->_conv.GetOutput(); this->_conv_post_film->Process_(conv_output, condition, num_frames); } + NAM_PROFILE_ADD(conv1d); if (this->_input_mixin_pre_film) { @@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput(); this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames); } + NAM_PROFILE_ADD(input_mixin); + this->_z.leftCols(num_frames).noalias() = _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames); + NAM_PROFILE_ADD(copies); + if (this->_activation_pre_film) { this->_activation_pre_film->Process_(this->_z, condition, num_frames); @@ -139,6 +147,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_gating_mode == GatingMode::NONE) { this->_activation->apply(this->_z.leftCols(num_frames)); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { this->_activation_post_film->Process_(this->_z, condition, num_frames); @@ -146,6 +155,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z, num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::GATED) @@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_gating_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); } } else if (this->_gating_mode == GatingMode::BLENDED) @@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma auto input_block = this->_z.leftCols(num_frames); auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames); this->_blending_activation->apply(input_block, output_block); + NAM_PROFILE_ADD(activation); if (this->_activation_post_film) { // Use Process() for blocks and copy result back @@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma if (this->_layer1x1) { this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames); + NAM_PROFILE_ADD(layer1x1); if (this->_layer1x1_post_film) { Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput(); @@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput(); this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames); } + NAM_PROFILE_ADD(head1x1); this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames); } else // No head 1x1 @@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma // If layer1x1 is inactive, residual connection is just the input (identity) this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames); } + NAM_PROFILE_ADD(copies); } // LayerArray ================================================================= @@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition, const int num_frames) { + NAM_PROFILE_START(); + // Process rechannel and get output this->_rechannel.process_(layer_inputs, num_frames); Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput(); + NAM_PROFILE_ADD(rechannel); // Process layers for (size_t i = 0; i < this->_layers.size(); i++) @@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames); // Process head rechannel +#ifdef NAM_PROFILING + _prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement +#endif _head_rechannel.process_(this->_head_inputs, num_frames); + NAM_PROFILE_ADD(rechannel); } diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp index 39c14b0..42556f5 100644 --- a/tools/benchmodel.cpp +++ b/tools/benchmodel.cpp @@ -4,6 +4,7 @@ #include "NAM/dsp.h" #include "NAM/get_dsp.h" +#include "NAM/profiling.h" using std::chrono::duration; using std::chrono::duration_cast; @@ -62,6 +63,9 @@ int main(int argc, char* argv[]) outputPtrs[ch] = outputBuffers[ch].data(); } + // Reset profiling counters before benchmark + nam::profiling::reset(); + std::cout << "Running benchmark\n"; auto t1 = high_resolution_clock::now(); for (size_t i = 0; i < numBuffers; i++) @@ -80,6 +84,9 @@ int main(int argc, char* argv[]) std::cout << ms_int.count() << "ms\n"; std::cout << ms_double.count() << "ms\n"; + + // Print profiling breakdown if enabled + nam::profiling::print_results(); } else { From 5c535246e531c7721c449b6c3c60871cbee92831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Fri, 6 Feb 2026 10:20:55 -0800 Subject: [PATCH 2/2] Fixed build flags for benchmodel --- tools/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8118e08..94adfce 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0") +# Benchmodel should be built with NAM_PROFILING set +target_compile_definitions(benchmodel PRIVATE NAM_PROFILING) # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG # We use a compile option to undefine it, which works on GCC, Clang, and MSVC @@ -61,4 +63,4 @@ endif() # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h # Don't let this break my build on debug: set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") -set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") \ No newline at end of file +set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")