sdatkinson · jfsantos · Feb 6, 2026 · Feb 6, 2026
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
@@ -1,4 +1,5 @@
 #include "conv1d.h"
+#include "profiling.h"
 #include <stdexcept>
 
 namespace nam
@@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)
 
 void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to avoid double-counting when Conv1D is called from within profiled blocks.
+
   // Write input to ring buffer
   _input_buffer.Write(input, num_frames);
 

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
@@ -8,6 +8,7 @@
 #include <unordered_set>
 
 #include "dsp.h"
+#include "profiling.h"
 #include "registry.h"
 
 #define tanh_impl_ std::tanh
@@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu
 
 void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
+  // Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
+  // to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel)
+  // rather than generic conv1x1.
   assert(num_frames <= _output.cols());
 
   if (this->_is_depthwise)

diff --git a/NAM/film.h b/NAM/film.h
@@ -81,9 +81,13 @@ class FiLM
     assert(num_frames <= condition.cols());
     assert(num_frames <= _output.cols());
 
+    // Conv1x1 to compute scale/shift from condition
     _cond_to_scale_shift.process_(condition, num_frames);
     const auto& scale_shift = _cond_to_scale_shift.GetOutput();
 
+    // Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin)
+    // rather than tracked separately, to avoid double-counting.
+
     const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
     if (_do_shift)
     {

diff --git a/NAM/profiling.cpp b/NAM/profiling.cpp
@@ -0,0 +1,47 @@
+#include "profiling.h"
+
+#ifdef NAM_PROFILING
+
+#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
+// ARM Cortex-M7: Use DWT cycle counter for precise timing
+#include "stm32h7xx.h"
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+// CPU frequency in MHz (Daisy runs at 480 MHz)
+static constexpr uint32_t CPU_FREQ_MHZ = 480;
+
+// Returns a microsecond timestamp derived from the DWT cycle counter.
+//
+// The raw 32-bit DWT->CYCCNT value wraps at 2^32 cycles (about 8.9 s at the
+// configured 480 MHz). Dividing the raw counter directly would produce a
+// timestamp that wraps at 2^32 / CPU_FREQ_MHZ rather than at 2^32, so the
+// unsigned `now - start` subtraction performed by the NAM_PROFILE_* macros
+// would yield a huge bogus delta whenever the counter rolled over inside a
+// measured section. To avoid that, cycle deltas are accumulated into a 64-bit
+// total and only that total is converted to microseconds; the returned value
+// therefore wraps cleanly modulo 2^32.
+//
+// Caveats:
+//  - Must be called at least once per counter wrap period, otherwise whole
+//    wraps go unnoticed.
+//  - Uses unsynchronized function-local state, so it must not be called
+//    concurrently (e.g. from an interrupt handler and the main loop).
+//  - Assumes the DWT cycle counter has been enabled elsewhere -- TODO confirm.
+uint32_t get_time_us() {
+  // Constant-initialized on purpose: the first call merely adds an arbitrary
+  // offset, which is harmless because callers only use differences.
+  static uint32_t last_cycles = 0;
+  static uint64_t total_cycles = 0;
+
+  const uint32_t now_cycles = DWT->CYCCNT;
+  // Unsigned subtraction yields the correct delta across a single wrap.
+  total_cycles += static_cast<uint32_t>(now_cycles - last_cycles);
+  last_cycles = now_cycles;
+  return static_cast<uint32_t>(total_cycles / CPU_FREQ_MHZ);
+}
+
+} // namespace profiling
+} // namespace nam
+
+#else
+// Non-ARM: Use std::chrono for timing (for testing on desktop)
+#include <chrono>
+
+namespace nam {
+namespace profiling {
+
+Timings g_timings;
+
+// Returns the number of microseconds elapsed since the first call, truncated
+// to 32 bits (so the value wraps after roughly 71.6 minutes).
+//
+// steady_clock is used because it is guaranteed to be monotonic;
+// high_resolution_clock may be an alias of system_clock, which can jump when
+// the wall clock is adjusted and would corrupt interval measurements.
+uint32_t get_time_us() {
+  using clock = std::chrono::steady_clock;
+  // Captured once, on the first call; every timestamp is relative to it.
+  static const clock::time_point start = clock::now();
+  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
+  return static_cast<uint32_t>(elapsed.count());
+}
+
+} // namespace profiling
+} // namespace nam
+
+#endif // ARM check
+
+#endif // NAM_PROFILING
diff --git a/NAM/profiling.h b/NAM/profiling.h
@@ -0,0 +1,153 @@
+#pragma once
+
+// Comprehensive profiling for NAM building blocks
+// Enable with -DNAM_PROFILING
+//
+// Usage:
+//   1. Call nam::profiling::reset() before benchmark
+//   2. Run model processing
+//   3. Call nam::profiling::print_results() to display breakdown
+//
+// Categories cover all WaveNet operations. FiLM modulation time is attributed
+// to the caller's category (see the note in film.h) rather than tracked
+// separately.
+
+#ifdef NAM_PROFILING
+
+#include <cstdint>
+#include <cstdio>
+
+namespace nam {
+namespace profiling {
+
+// Per-category accumulators of elapsed time, all expressed in microseconds.
+struct Timings {
+  // --- Convolutions ---
+  uint32_t conv1d{0};       // Dilated convolution (Conv1D)
+  uint32_t input_mixin{0};  // Conv1x1: input mixing
+  uint32_t layer1x1{0};     // Conv1x1: layer 1x1 (residual projection)
+  uint32_t head1x1{0};      // Conv1x1: head 1x1 (skip connection projection)
+  uint32_t rechannel{0};    // Conv1x1: rechannel (input/output)
+  uint32_t conv1x1{0};      // Conv1x1: catch-all for non-WaveNet uses
+
+  // --- Element-wise stages ---
+  uint32_t activation{0};   // Activation functions (tanh, ReLU, Softsign, etc.)
+  uint32_t film{0};         // Feature-wise Linear Modulation (scale/shift)
+
+  // --- Memory operations ---
+  uint32_t copies{0};       // Memory copies and additions
+  uint32_t setzero{0};      // setZero() calls
+  uint32_t ringbuf{0};      // Ring buffer operations (Write, Read, Advance)
+
+  // --- Everything else ---
+  uint32_t condition{0};    // Condition DSP processing
+  uint32_t lstm{0};         // LSTM cell computations (for LSTM models)
+  uint32_t other{0};        // Anything not covered above
+
+  // Set every accumulator back to zero.
+  void reset() {
+    // A freshly constructed value has every field at its default of 0.
+    *this = Timings{};
+  }
+
+  // Sum of all accumulators, in microseconds (32-bit, so it wraps on overflow).
+  uint32_t total() const {
+    const uint32_t convolutions = conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1;
+    const uint32_t memory_ops = copies + setzero + ringbuf;
+    const uint32_t remaining = activation + film + condition + lstm + other;
+    return convolutions + memory_ops + remaining;
+  }
+};
+
+// Global timing accumulator that the NAM_PROFILE_ADD* macros write into.
+// Only declared here; the definition lives in profiling.cpp.
+extern Timings g_timings;
+
+// Returns the current time in microseconds as a 32-bit value. The
+// implementation is platform-specific (see profiling.cpp); only the
+// difference between two readings is meaningful.
+uint32_t get_time_us();
+
+// Clear every accumulated timing in the global accumulator.
+inline void reset()
+{
+  g_timings.reset();
+}
+
+// Print the accumulated profiling results to stdout as a table with one row
+// per category (time in milliseconds and integer percentage of the total),
+// followed by a total row.
+//
+// Categories with zero accumulated time are omitted, unless the overall total
+// is zero, in which case every category is listed.
+inline void print_results() {
+  const auto& t = g_timings;
+  const uint32_t total = t.total();
+
+  printf("\nProfiling breakdown:\n");
+  // The percent sign is passed as a %s *argument* here, so it must not be
+  // escaped as "%%" (that would print two characters).
+  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+
+  auto print_row = [total](const char* name, uint32_t us) {
+    if (us > 0 || total == 0) {
+      // Widen before multiplying: us * 100 overflows uint32_t once a category
+      // exceeds roughly 42.9 seconds.
+      const uint64_t pct = total > 0 ? (static_cast<uint64_t>(us) * 100 / total) : 0;
+      // Divide in double precision; float cannot represent every value above 2^24 us.
+      printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0, static_cast<unsigned long>(pct));
+    }
+  };
+
+  print_row("Conv1D", t.conv1d);
+  print_row("InputMixin", t.input_mixin);
+  print_row("Layer1x1", t.layer1x1);
+  print_row("Head1x1", t.head1x1);
+  print_row("Rechannel", t.rechannel);
+  print_row("Conv1x1", t.conv1x1);
+  print_row("Activation", t.activation);
+  print_row("FiLM", t.film);
+  print_row("Copies", t.copies);
+  print_row("SetZero", t.setzero);
+  print_row("RingBuf", t.ringbuf);
+  print_row("Condition", t.condition);
+  print_row("LSTM", t.lstm);
+  print_row("Other", t.other);
+
+  printf("%-12s %8s %6s\n", "--------", "--------", "----");
+  printf("%-12s %8.1f %5s\n", "Total", total / 1000.0, "100%");
+}
+
+// Helper macros for timing consecutive sections of a function.
+//
+// Usage:
+//   NAM_PROFILE_START();
+//   // ... code to profile ...
+//   NAM_PROFILE_ADD(conv1d);  // Adds elapsed time to conv1d, resets timer
+//
+// NAM_PROFILE_START() declares a local variable named `_prof_start` holding
+// the current timestamp, so it can be used at most once per scope and must
+// precede any NAM_PROFILE_ADD* in that scope.
+//
+// NAM_PROFILE_ADD(category) adds the time elapsed since `_prof_start` to
+// g_timings.<category> and then moves `_prof_start` to "now", so back-to-back
+// calls attribute each section to exactly one category. The elapsed time is
+// computed with unsigned 32-bit subtraction of two get_time_us() readings.
+
+#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
+#define NAM_PROFILE_ADD(category) do { \
+  uint32_t _prof_now = nam::profiling::get_time_us(); \
+  nam::profiling::g_timings.category += (_prof_now - _prof_start); \
+  _prof_start = _prof_now; \
+} while(0)
+
+// Variant that doesn't reset the timer (for one-shot measurements).
+// `_prof_start` is left untouched, so a later NAM_PROFILE_ADD* in the same
+// scope measures from the original start again.
+#define NAM_PROFILE_ADD_NORESTART(category) \
+  nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start)
+
+} // namespace profiling
+} // namespace nam
+
+#else // NAM_PROFILING not defined
+
+// Profiling disabled: the public functions become empty inline stubs so that
+// callers compile unchanged.
+namespace nam {
+namespace profiling {
+
+// Nothing to reset when no timings are collected.
+inline void reset()
+{
+}
+
+// Nothing to print when no timings are collected.
+inline void print_results()
+{
+}
+
+} // namespace profiling
+} // namespace nam
+
+// Each timing macro expands to a discarded void expression; the `category`
+// argument is never evaluated.
+#define NAM_PROFILE_START() ((void)0)
+#define NAM_PROFILE_ADD(category) ((void)0)
+#define NAM_PROFILE_ADD_NORESTART(category) ((void)0)
+
+#endif // NAM_PROFILING
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
@@ -6,6 +6,7 @@
 #include <Eigen/Dense>
 
 #include "get_dsp.h"
+#include "profiling.h"
 #include "registry.h"
 #include "wavenet.h"
 
@@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)
 
 void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames)
 {
+  NAM_PROFILE_START();
+
   const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels
 
   // Step 1: input convolutions
@@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& conv_output = this->_conv.GetOutput();
     this->_conv_post_film->Process_(conv_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(conv1d);
 
   if (this->_input_mixin_pre_film)
   {
@@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
     this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
   }
+  NAM_PROFILE_ADD(input_mixin);
+
   this->_z.leftCols(num_frames).noalias() =
     _conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
+  NAM_PROFILE_ADD(copies);
+
   if (this->_activation_pre_film)
   {
     this->_activation_pre_film->Process_(this->_z, condition, num_frames);
@@ -139,13 +147,15 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
   if (this->_gating_mode == GatingMode::NONE)
   {
     this->_activation->apply(this->_z.leftCols(num_frames));
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       this->_activation_post_film->Process_(this->_z, condition, num_frames);
     }
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z, num_frames);
+      NAM_PROFILE_ADD(layer1x1);
     }
   }
   else if (this->_gating_mode == GatingMode::GATED)
@@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_gating_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(layer1x1);
     }
   }
   else if (this->_gating_mode == GatingMode::BLENDED)
@@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     auto input_block = this->_z.leftCols(num_frames);
     auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
     this->_blending_activation->apply(input_block, output_block);
+    NAM_PROFILE_ADD(activation);
     if (this->_activation_post_film)
     {
       // Use Process() for blocks and copy result back
@@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     if (this->_layer1x1)
     {
       this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
+      NAM_PROFILE_ADD(layer1x1);
       if (this->_layer1x1_post_film)
       {
         Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput();
@@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
       Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput();
       this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames);
     }
+    NAM_PROFILE_ADD(head1x1);
     this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames);
   }
   else // No head 1x1
@@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
     // If layer1x1 is inactive, residual connection is just the input (identity)
     this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames);
   }
+  NAM_PROFILE_ADD(copies);
 }
 
 // LayerArray =================================================================
@@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con
 void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition,
                                              const int num_frames)
 {
+  NAM_PROFILE_START();
+
   // Process rechannel and get output
   this->_rechannel.process_(layer_inputs, num_frames);
   Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput();
+  NAM_PROFILE_ADD(rechannel);
 
   // Process layers
   for (size_t i = 0; i < this->_layers.size(); i++)
@@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
     this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);
 
   // Process head rechannel
+#ifdef NAM_PROFILING
+  _prof_start = nam::profiling::get_time_us();  // Reset timer for accurate head_rechannel measurement
+#endif
   _head_rechannel.process_(this->_head_inputs, num_frames);
+  NAM_PROFILE_ADD(rechannel);
 }
 
 

diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
@@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE
 # Compile run_tests without optimizations to ensure allocation tracking works correctly
 # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
 set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0")
+# Benchmodel should be built with NAM_PROFILING set
+target_compile_definitions(benchmodel PRIVATE NAM_PROFILING)
 # Ensure assertions are enabled for run_tests by removing NDEBUG if it was set
 # Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG
 # We use a compile option to undefine it, which works on GCC, Clang, and MSVC
@@ -61,4 +63,4 @@ endif()
 # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
 # Don't let this break my build on debug:
 set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
-set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
+set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")