From 4832cda1b28d09026840255e6c4dfa9e8796a789 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Fri, 6 Feb 2026 09:33:50 -0800
Subject: [PATCH 1/2] Add profiling instrumentation for NAM building blocks

Adds a profiling framework (NAM/profiling.h, NAM/profiling.cpp) with
NAM_PROFILE_START()/NAM_PROFILE_ADD() macros and 14 timing categories.
Supports both desktop (std::chrono) and ARM Cortex-M7 (DWT cycle counter)
backends. Profiling is compile-time gated via -DNAM_PROFILING.

Instruments wavenet _Layer::Process() and _LayerArray::ProcessInner()
with per-category timing, and adds profiling reset/print calls to the
benchmodel tool.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 NAM/conv1d.cpp       |   4 ++
 NAM/dsp.cpp          |   4 ++
 NAM/film.h           |   4 ++
 NAM/profiling.cpp    |  47 +++++++++++++
 NAM/profiling.h      | 153 +++++++++++++++++++++++++++++++++++++++++++
 NAM/wavenet.cpp      |  23 +++++++
 tools/benchmodel.cpp |   7 ++
 7 files changed, 242 insertions(+)
 create mode 100644 NAM/profiling.cpp
 create mode 100644 NAM/profiling.h
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index 9bbbc02..d440f0c 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -1,4 +1,5 @@
 #include "conv1d.h"
+#include "profiling.h"
 #include <stdexcept>
 
 namespace nam
@@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)
 
 void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to avoid double-counting when Conv1D is called from within profiled blocks.
+
   // Write input to ring buffer
   _input_buffer.Write(input, num_frames);
 
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 05dab09..b644af3 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -8,6 +8,7 @@
 #include <unordered_set>
 
 #include "dsp.h"
+#include "profiling.h"
 #include "registry.h"
 
 #define tanh_impl_ std::tanh
@@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu
 
 void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel)
+  // rather than generic conv1x1.
   assert(num_frames <= _output.cols());
 
   if (this->_is_depthwise)
diff --git a/NAM/film.h b/NAM/film.h
index f0f86fb..eeb750a 100644
--- a/NAM/film.h
+++ b/NAM/film.h
@@ -81,9 +81,13 @@ class FiLM
     assert(num_frames <= condition.cols());
     assert(num_frames <= _output.cols());
 
+    // Conv1x1 to compute scale/shift from condition
     _cond_to_scale_shift.process_(condition, num_frames);
     const auto& scale_shift = _cond_to_scale_shift.GetOutput();
 
+    // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin)
+    // rather than tracked separately, to avoid double-counting.
+
     const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
     if (_do_shift)
     {
diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
new file mode 100644
index 0000000..65d430e
--- /dev/null
+++ b/NAM/profiling.cpp
@@ -0,0 +1,47 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+uint32_t get_time_us() {
+  // DWT->CYCCNT gives cycle count
+  // Divide by CPU_FREQ_MHZ to get microseconds
+  return DWT->CYCCNT / CPU_FREQ_MHZ;
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+uint32_t get_time_us() {
+  using namespace std::chrono;
+  static auto start = high_resolution_clock::now();
+  auto now = high_resolution_clock::now();
+  return (uint32_t)duration_cast<microseconds>(now - start).count();
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
new file mode 100644
index 0000000..71031fe
--- /dev/null
+++ b/NAM/profiling.h
@@ -0,0 +1,153 @@
+#pragma once
+
+// Comprehensive profiling for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+//   1. Call nam::profiling::reset() before benchmark
+//   2. Run model processing
+//   3. Call nam::profiling::print_results() to display breakdown
+//
+// Categories cover all WaveNet operations including FiLM modulation.
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+// Timing accumulators (in microseconds)
+struct Timings {
+  // Dilated convolution (Conv1D)
+  uint32_t conv1d = 0;
+
+  // Pointwise convolutions (Conv1x1 variants)
+  uint32_t input_mixin = 0;   // Input mixing Conv1x1
+  uint32_t layer1x1 = 0;      // Layer 1x1 (residual projection)
+  uint32_t head1x1 = 0;       // Head 1x1 (skip connection projection)
+  uint32_t rechannel = 0;     // Rechannel Conv1x1 (input/output)
+  uint32_t conv1x1 = 0;       // Other Conv1x1 (catch-all for non-WaveNet uses)
+
+  // Activation
+  uint32_t activation = 0;    // Activation functions (tanh, ReLU, Softsign, etc.)
+
+  // FiLM modulation
+  uint32_t film = 0;          // Feature-wise Linear Modulation (scale/shift)
+
+  // Memory operations
+  uint32_t copies = 0;        // Memory copies and additions
+  uint32_t setzero = 0;       // setZero() calls
+  uint32_t ringbuf = 0;       // Ring buffer operations (Write, Read, Advance)
+
+  // Conditioning
+  uint32_t condition = 0;     // Condition DSP processing
+
+  // LSTM (for LSTM models)
+  uint32_t lstm = 0;          // LSTM cell computations
+
+  // Catch-all
+  uint32_t other = 0;         // Everything else
+
+  void reset() {
+    conv1d = 0;
+    input_mixin = 0;
+    layer1x1 = 0;
+    head1x1 = 0;
+    rechannel = 0;
+    conv1x1 = 0;
+    activation = 0;
+    film = 0;
+    copies = 0;
+    setzero = 0;
+    ringbuf = 0;
+    condition = 0;
+    lstm = 0;
+    other = 0;
+  }
+
+  uint32_t total() const {
+    return conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1 + activation + film + copies + setzero + ringbuf + condition + lstm + other;
+  }
+};
+
+// Global timing accumulator
+extern Timings g_timings;
+
+// Get current time in microseconds (platform-specific)
+uint32_t get_time_us();
+
+// Reset profiling counters
+inline void reset() { g_timings.reset(); }
+
+// Print profiling results to stdout
+inline void print_results() {
+  const auto& t = g_timings;
+  uint32_t total = t.total();
+
+  printf("\nProfiling breakdown:\n");
+  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%%");
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+
+  auto print_row = [total](const char* name, uint32_t us) {
+    if (us > 0 || total == 0) {
+      uint32_t pct = total > 0 ? (us * 100 / total) : 0;
+      printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0f, (unsigned long)pct);
+    }
+  };
+
+  print_row("Conv1D", t.conv1d);
+  print_row("InputMixin", t.input_mixin);
+  print_row("Layer1x1", t.layer1x1);
+  print_row("Head1x1", t.head1x1);
+  print_row("Rechannel", t.rechannel);
+  print_row("Conv1x1", t.conv1x1);
+  print_row("Activation", t.activation);
+  print_row("FiLM", t.film);
+  print_row("Copies", t.copies);
+  print_row("SetZero", t.setzero);
+  print_row("RingBuf", t.ringbuf);
+  print_row("Condition", t.condition);
+  print_row("LSTM", t.lstm);
+  print_row("Other", t.other);
+
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+  printf("%-12s %8.1f %5s\n", "Total", total / 1000.0f, "100%");
+}
+
+// Helper macros for timing sections
+// Usage:
+//   NAM_PROFILE_START();
+//   // ... code to profile ...
+//   NAM_PROFILE_ADD(conv1d);  // Adds elapsed time to conv1d, resets timer
+
+#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
+#define NAM_PROFILE_ADD(category) do { \
+  uint32_t _prof_now = nam::profiling::get_time_us(); \
+  nam::profiling::g_timings.category += (_prof_now - _prof_start); \
+  _prof_start = _prof_now; \
+} while(0)
+
+// Variant that doesn't reset the timer (for one-shot measurements)
+#define NAM_PROFILE_ADD_NORESTART(category) \
+  nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start)
+
+} // namespace profiling
+} // namespace nam
+
+#else // NAM_PROFILING not defined
+
+// No-op macros when profiling is disabled
+#define NAM_PROFILE_START() ((void)0)
+#define NAM_PROFILE_ADD(category) ((void)0)
+#define NAM_PROFILE_ADD_NORESTART(category) ((void)0)
+
+namespace nam {
+namespace profiling {
+  inline void reset() {}
+  inline void print_results() {}
+} // namespace profiling
+} // namespace nam
+
+#endif // NAM_PROFILING
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 7d9b5d0..6112169 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -6,6 +6,7 @@
 #include <Eigen/Dense>
 
 #include "get_dsp.h"
+#include "profiling.h"
 #include "registry.h"
 #include "wavenet.h"
 
@@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
 
 void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames)
 {
+  NAM_PROFILE_START();
+
   const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels
 
   // Step 1: input convolutions
@@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& conv_output = this->_conv.GetOutput();
     this->_conv_post_film->Process_(conv_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(conv1d);
 
   if (this->_input_mixin_pre_film)
   {
@@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
     this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(input_mixin);
+
   this->_z.leftCols(num_frames).noalias() =
     _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
+  NAM_PROFILE_ADD(copies);
+
   if (this->_activation_pre_film)
   {
     this->_activation_pre_film->Process_(this->_z, condition, num_frames);
@@ -139,6 +147,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
   if (this->_gating_mode == GatingMode::NONE)
   {
     this->_activation->apply(this->_z.leftCols(num_frames));
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       this->_activation_post_film->Process_(this->_z, condition, num_frames);
@@ -146,6 +155,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z, num_frames);
+      NAM_PROFILE_ADD(layer1x1);
     }
   }
   else if (this->_gating_mode == GatingMode::GATED)
@@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_gating_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(layer1x1);
     }
   }
   else if (this->_gating_mode == GatingMode::BLENDED)
@@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_blending_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(layer1x1);
       if (this->_layer1x1_post_film)
       {
         Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput();
@@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
       Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput();
       this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames);
     }
+    NAM_PROFILE_ADD(head1x1);
     this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames);
   }
   else // No head 1x1
@@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     // If layer1x1 is inactive, residual connection is just the input (identity)
     this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames);
   }
+  NAM_PROFILE_ADD(copies);
 }
 
 // LayerArray =================================================================
@@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con
 void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition,
                                              const int num_frames)
 {
+  NAM_PROFILE_START();
+
   // Process rechannel and get output
   this->_rechannel.process_(layer_inputs, num_frames);
   Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput();
+  NAM_PROFILE_ADD(rechannel);
 
   // Process layers
   for (size_t i = 0; i < this->_layers.size(); i++)
@@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
     this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);
 
   // Process head rechannel
+#ifdef NAM_PROFILING
+  _prof_start = nam::profiling::get_time_us();  // Reset timer for accurate head_rechannel measurement
+#endif
   _head_rechannel.process_(this->_head_inputs, num_frames);
+  NAM_PROFILE_ADD(rechannel);
 }
 
 
diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp
index 39c14b0..42556f5 100644
--- a/tools/benchmodel.cpp
+++ b/tools/benchmodel.cpp
@@ -4,6 +4,7 @@
 
 #include "NAM/dsp.h"
 #include "NAM/get_dsp.h"
+#include "NAM/profiling.h"
 
 using std::chrono::duration;
 using std::chrono::duration_cast;
@@ -62,6 +63,9 @@ int main(int argc, char* argv[])
       outputPtrs[ch] = outputBuffers[ch].data();
     }
 
+    // Reset profiling counters before benchmark
+    nam::profiling::reset();
+
     std::cout << "Running benchmark\n";
     auto t1 = high_resolution_clock::now();
     for (size_t i = 0; i < numBuffers; i++)
@@ -80,6 +84,9 @@ int main(int argc, char* argv[])
 
     std::cout << ms_int.count() << "ms\n";
     std::cout << ms_double.count() << "ms\n";
+
+    // Print profiling breakdown if enabled
+    nam::profiling::print_results();
   }
   else
   {

From 5c535246e531c7721c449b6c3c60871cbee92831 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Fri, 6 Feb 2026 10:20:55 -0800
Subject: [PATCH 2/2] Fixed build flags for benchmodel

---
 tools/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8118e08..94adfce 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE
 # Compile run_tests without optimizations to ensure allocation tracking works correctly
 # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
 set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0")
+# Benchmodel should be built with NAM_PROFILING set
+target_compile_definitions(benchmodel PRIVATE NAM_PROFILING)
 # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set
 # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG
 # We use a compile option to undefine it, which works on GCC, Clang, and MSVC
@@ -61,4 +63,4 @@ endif()
 # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
 # Don't let this break my build on debug:
 set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
-set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
\ No newline at end of file
+set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")