Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAM/conv1d.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "conv1d.h"
#include "profiling.h"
#include <stdexcept>

namespace nam
Expand Down Expand Up @@ -143,6 +144,9 @@ void Conv1D::SetMaxBufferSize(const int maxBufferSize)

void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
{
// Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
// to avoid double-counting when Conv1D is called from within profiled blocks.

// Write input to ring buffer
_input_buffer.Write(input, num_frames);

Expand Down
4 changes: 4 additions & 0 deletions NAM/dsp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <unordered_set>

#include "dsp.h"
#include "profiling.h"
#include "registry.h"

#define tanh_impl_ std::tanh
Expand Down Expand Up @@ -443,6 +444,9 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu

void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
{
// Note: Profiling is done at the caller level (e.g., _Layer::Process in wavenet.cpp)
// to provide meaningful categories (input_mixin, layer1x1, head1x1, rechannel)
// rather than generic conv1x1.
assert(num_frames <= _output.cols());

if (this->_is_depthwise)
Expand Down
4 changes: 4 additions & 0 deletions NAM/film.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,13 @@ class FiLM
assert(num_frames <= condition.cols());
assert(num_frames <= _output.cols());

// Conv1x1 to compute scale/shift from condition
_cond_to_scale_shift.process_(condition, num_frames);
const auto& scale_shift = _cond_to_scale_shift.GetOutput();

// Note: FiLM time is included in the caller's profiling category (e.g., conv1d, input_mixin)
// rather than tracked separately, to avoid double-counting.

const auto scale = scale_shift.topRows(get_input_dim()).leftCols(num_frames);
if (_do_shift)
{
Expand Down
47 changes: 47 additions & 0 deletions NAM/profiling.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include "profiling.h"

#ifdef NAM_PROFILING

#if defined(__ARM_ARCH_7EM__) || defined(ARM_MATH_CM7)
// ARM Cortex-M7: Use DWT cycle counter for precise timing
#include "stm32h7xx.h"

namespace nam {
namespace profiling {

Timings g_timings;

// CPU frequency in MHz (Daisy runs at 480 MHz)
static constexpr uint32_t CPU_FREQ_MHZ = 480;

// Returns a free-running microsecond timestamp derived from the DWT cycle counter.
//
// DWT->CYCCNT is a 32-bit counter, so at CPU_FREQ_MHZ cycles per microsecond it
// wraps long before a 32-bit microsecond value would. Returning CYCCNT / CPU_FREQ_MHZ
// directly therefore produces a timestamp whose wrap point is not 2^32, and the
// unsigned subtraction `now - start` used by the profiling macros yields a huge
// bogus interval whenever the counter wraps between two readings.
//
// Instead, the modulo-2^32 cycle delta since the previous call is accumulated into
// a microsecond counter that itself wraps at 2^32, so differences between readings
// stay correct across CYCCNT wraps.
//
// Requirements / caveats:
//  - Must be called at least once per CYCCNT wrap period for the absolute value to
//    keep advancing correctly; intervals between two nearby readings are always right.
//  - Uses function-local state and is not reentrant; call from a single context.
//  - NOTE(review): assumes the DWT cycle counter has been enabled elsewhere -- confirm.
uint32_t get_time_us() {
  // Zero-initialised so no dynamic-initialisation guard is needed.
  static uint32_t last_cycles = 0;
  static uint32_t elapsed_us = 0;
  static uint32_t leftover_cycles = 0; // Cycles not yet converted; < CPU_FREQ_MHZ between calls

  const uint32_t cycles = DWT->CYCCNT;
  const uint32_t delta = cycles - last_cycles; // Modulo-2^32: correct across a counter wrap
  last_cycles = cycles;

  elapsed_us += delta / CPU_FREQ_MHZ;
  leftover_cycles += delta % CPU_FREQ_MHZ;
  if (leftover_cycles >= CPU_FREQ_MHZ) {
    leftover_cycles -= CPU_FREQ_MHZ;
    ++elapsed_us;
  }
  return elapsed_us;
}

} // namespace profiling
} // namespace nam

#else
// Non-ARM: Use std::chrono for timing (for testing on desktop)
#include <chrono>

namespace nam {
namespace profiling {

Timings g_timings;

// Returns the number of microseconds elapsed since the first call to this function,
// truncated to 32 bits (the value wraps after 2^32 microseconds).
//
// steady_clock is used because it is guaranteed monotonic; high_resolution_clock may
// be an alias of the wall clock, in which case a system time adjustment during a
// benchmark would corrupt the measured intervals.
uint32_t get_time_us() {
  using clock = std::chrono::steady_clock;

  // Captured once, on the first call, so timestamps start near zero.
  static const clock::time_point origin = clock::now();

  const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - origin);
  return static_cast<uint32_t>(elapsed.count());
}

} // namespace profiling
} // namespace nam

#endif // ARM check

#endif // NAM_PROFILING
153 changes: 153 additions & 0 deletions NAM/profiling.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#pragma once

// Comprehensive profiling for NAM building blocks
// Enable with -DNAM_PROFILING
//
// Usage:
// 1. Call nam::profiling::reset() before benchmark
// 2. Run model processing
// 3. Call nam::profiling::print_results() to display breakdown
//
// Categories cover all WaveNet operations including FiLM modulation.

#ifdef NAM_PROFILING

#include <cstdint>
#include <cstdio>

namespace nam {
namespace profiling {

// Timing accumulators (in microseconds)
// Per-category accumulated time, in microseconds.
// All counters are unsigned 32-bit and wrap on overflow.
struct Timings {
  // --- Convolutions ---
  uint32_t conv1d = 0;      // Dilated convolution (Conv1D)
  uint32_t input_mixin = 0; // Input mixing Conv1x1
  uint32_t layer1x1 = 0;    // Layer 1x1 (residual projection)
  uint32_t head1x1 = 0;     // Head 1x1 (skip connection projection)
  uint32_t rechannel = 0;   // Rechannel Conv1x1 (input/output)
  uint32_t conv1x1 = 0;     // Other Conv1x1 (catch-all for non-WaveNet uses)

  // --- Non-linearities and modulation ---
  uint32_t activation = 0; // Activation functions (tanh, ReLU, Softsign, etc.)
  uint32_t film = 0;       // Feature-wise Linear Modulation (scale/shift)

  // --- Memory operations ---
  uint32_t copies = 0;  // Memory copies and additions
  uint32_t setzero = 0; // setZero() calls
  uint32_t ringbuf = 0; // Ring buffer operations (Write, Read, Advance)

  // --- Everything else ---
  uint32_t condition = 0; // Condition DSP processing
  uint32_t lstm = 0;      // LSTM cell computations (for LSTM models)
  uint32_t other = 0;     // Catch-all

  // Sets every counter back to zero.
  void reset() { *this = Timings{}; }

  // Sum of every counter (modulo 2^32).
  uint32_t total() const {
    const uint32_t convolutions = conv1d + input_mixin + layer1x1 + head1x1 + rechannel + conv1x1;
    const uint32_t modulation = activation + film;
    const uint32_t memory = copies + setzero + ringbuf;
    const uint32_t remainder = condition + lstm + other;
    return convolutions + modulation + memory + remainder;
  }
};

// Global timing accumulator shared by all profiled code.
// Declared here; the single definition lives in profiling.cpp.
extern Timings g_timings;

// Get current time in microseconds (platform-specific, implemented in profiling.cpp).
// The origin of the returned timestamp is implementation-defined and the value is
// only 32 bits wide, so only the difference between two readings is meaningful.
uint32_t get_time_us();

// Reset profiling counters: zeroes every category in g_timings.
inline void reset() { g_timings.reset(); }

// Print profiling results to stdout
// Prints the per-category timing breakdown held in g_timings to stdout.
//
// Each row shows the category name, its accumulated time in milliseconds and its
// integer share of the total. Categories with zero time are omitted unless nothing
// at all was recorded, in which case every row is printed with 0%.
inline void print_results() {
  const auto& t = g_timings;
  const uint32_t total = t.total();

  printf("\nProfiling breakdown:\n");
  // "%" is passed as a string argument here, not as part of the format string, so it
  // must not be doubled (a doubled "%%" argument would be printed literally).
  printf("%-12s %8s %6s\n", "Category", "Time(ms)", "%");
  printf("%-12s %8s %6s\n", "--------", "--------", "----");

  auto print_row = [total](const char* name, uint32_t us) {
    if (us > 0 || total == 0) {
      // Widen before multiplying: us * 100 overflows 32 bits once a category
      // accumulates more than about 42.9 seconds.
      const uint64_t pct = total > 0 ? (static_cast<uint64_t>(us) * 100u / total) : 0u;
      printf("%-12s %8.1f %5lu%%\n", name, us / 1000.0, static_cast<unsigned long>(pct));
    }
  };

  const struct {
    const char* name;
    uint32_t us;
  } rows[] = {
    {"Conv1D", t.conv1d},         {"InputMixin", t.input_mixin}, {"Layer1x1", t.layer1x1},
    {"Head1x1", t.head1x1},       {"Rechannel", t.rechannel},    {"Conv1x1", t.conv1x1},
    {"Activation", t.activation}, {"FiLM", t.film},              {"Copies", t.copies},
    {"SetZero", t.setzero},       {"RingBuf", t.ringbuf},        {"Condition", t.condition},
    {"LSTM", t.lstm},             {"Other", t.other},
  };
  for (const auto& row : rows) {
    print_row(row.name, row.us);
  }

  printf("%-12s %8s %6s\n", "--------", "--------", "----");
  printf("%-12s %8.1f %5s\n", "Total", total / 1000.0, "100%");
}

// Helper macros for timing sections
// Usage:
// NAM_PROFILE_START();
// // ... code to profile ...
// NAM_PROFILE_ADD(conv1d); // Adds elapsed time to conv1d, resets timer
//
// Notes:
// - NAM_PROFILE_START() declares a local variable named _prof_start, so it can be
//   used at most once per scope and must appear before any NAM_PROFILE_ADD* in
//   that scope.
// - `category` must be the name of a member of nam::profiling::Timings.
// - Macros are not scoped by the enclosing namespace; they are visible to every
//   file that includes this header.

// Starts a measurement by recording the current timestamp in _prof_start.
#define NAM_PROFILE_START() uint32_t _prof_start = nam::profiling::get_time_us()
// Adds the time elapsed since _prof_start to g_timings.<category>, then moves
// _prof_start to "now" so consecutive sections can be timed back to back.
#define NAM_PROFILE_ADD(category) do { \
uint32_t _prof_now = nam::profiling::get_time_us(); \
nam::profiling::g_timings.category += (_prof_now - _prof_start); \
_prof_start = _prof_now; \
} while(0)

// Variant that doesn't reset the timer (for one-shot measurements)
// Expands to a bare expression: _prof_start is left unchanged.
#define NAM_PROFILE_ADD_NORESTART(category) \
nam::profiling::g_timings.category += (nam::profiling::get_time_us() - _prof_start)

} // namespace profiling
} // namespace nam

#else // NAM_PROFILING not defined

// No-op macros when profiling is disabled
// Each expands to a void expression, so call sites written as `NAM_PROFILE_...(...);`
// still compile. No _prof_start variable is declared in this configuration.
#define NAM_PROFILE_START() ((void)0)
#define NAM_PROFILE_ADD(category) ((void)0)
#define NAM_PROFILE_ADD_NORESTART(category) ((void)0)

// Empty stand-ins so callers can invoke reset()/print_results() unconditionally.
namespace nam {
namespace profiling {
inline void reset() {}
inline void print_results() {}
} // namespace profiling
} // namespace nam

#endif // NAM_PROFILING
23 changes: 23 additions & 0 deletions NAM/wavenet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <Eigen/Dense>

#include "get_dsp.h"
#include "profiling.h"
#include "registry.h"
#include "wavenet.h"

Expand Down Expand Up @@ -89,6 +90,8 @@ void nam::wavenet::_Layer::set_weights_(std::vector<float>::iterator& weights)

void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::MatrixXf& condition, const int num_frames)
{
NAM_PROFILE_START();

const long bottleneck = this->_bottleneck; // Use the actual bottleneck value, not the doubled output channels

// Step 1: input convolutions
Expand All @@ -107,6 +110,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
Eigen::MatrixXf& conv_output = this->_conv.GetOutput();
this->_conv_post_film->Process_(conv_output, condition, num_frames);
}
NAM_PROFILE_ADD(conv1d);

if (this->_input_mixin_pre_film)
{
Expand All @@ -123,8 +127,12 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
}
NAM_PROFILE_ADD(input_mixin);

this->_z.leftCols(num_frames).noalias() =
_conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
NAM_PROFILE_ADD(copies);

if (this->_activation_pre_film)
{
this->_activation_pre_film->Process_(this->_z, condition, num_frames);
Expand All @@ -139,13 +147,15 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
if (this->_gating_mode == GatingMode::NONE)
{
this->_activation->apply(this->_z.leftCols(num_frames));
NAM_PROFILE_ADD(activation);
if (this->_activation_post_film)
{
this->_activation_post_film->Process_(this->_z, condition, num_frames);
}
if (this->_layer1x1)
{
this->_layer1x1->process_(this->_z, num_frames);
NAM_PROFILE_ADD(layer1x1);
}
}
else if (this->_gating_mode == GatingMode::GATED)
Expand All @@ -155,6 +165,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
auto input_block = this->_z.leftCols(num_frames);
auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
this->_gating_activation->apply(input_block, output_block);
NAM_PROFILE_ADD(activation);
if (this->_activation_post_film)
{
// Use Process() for blocks and copy result back
Expand All @@ -165,6 +176,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
if (this->_layer1x1)
{
this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
NAM_PROFILE_ADD(layer1x1);
}
}
else if (this->_gating_mode == GatingMode::BLENDED)
Expand All @@ -174,6 +186,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
auto input_block = this->_z.leftCols(num_frames);
auto output_block = this->_z.topRows(bottleneck).leftCols(num_frames);
this->_blending_activation->apply(input_block, output_block);
NAM_PROFILE_ADD(activation);
if (this->_activation_post_film)
{
// Use Process() for blocks and copy result back
Expand All @@ -184,6 +197,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
if (this->_layer1x1)
{
this->_layer1x1->process_(this->_z.topRows(bottleneck), num_frames);
NAM_PROFILE_ADD(layer1x1);
if (this->_layer1x1_post_film)
{
Eigen::MatrixXf& layer1x1_output = this->_layer1x1->GetOutput();
Expand All @@ -207,6 +221,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
Eigen::MatrixXf& head1x1_output = this->_head1x1->GetOutput();
this->_head1x1_post_film->Process_(head1x1_output, condition, num_frames);
}
NAM_PROFILE_ADD(head1x1);
this->_output_head.leftCols(num_frames).noalias() = this->_head1x1->GetOutput().leftCols(num_frames);
}
else // No head 1x1
Expand All @@ -230,6 +245,7 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
// If layer1x1 is inactive, residual connection is just the input (identity)
this->_output_next_layer.leftCols(num_frames).noalias() = input.leftCols(num_frames);
}
NAM_PROFILE_ADD(copies);
}

// LayerArray =================================================================
Expand Down Expand Up @@ -298,9 +314,12 @@ void nam::wavenet::_LayerArray::Process(const Eigen::MatrixXf& layer_inputs, con
void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs, const Eigen::MatrixXf& condition,
const int num_frames)
{
NAM_PROFILE_START();

// Process rechannel and get output
this->_rechannel.process_(layer_inputs, num_frames);
Eigen::MatrixXf& rechannel_output = _rechannel.GetOutput();
NAM_PROFILE_ADD(rechannel);

// Process layers
for (size_t i = 0; i < this->_layers.size(); i++)
Expand Down Expand Up @@ -329,7 +348,11 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
this->_layers[last_layer].GetOutputNextLayer().leftCols(num_frames);

// Process head rechannel
#ifdef NAM_PROFILING
_prof_start = nam::profiling::get_time_us(); // Reset timer for accurate head_rechannel measurement
#endif
_head_rechannel.process_(this->_head_inputs, num_frames);
NAM_PROFILE_ADD(rechannel);
}


Expand Down
4 changes: 3 additions & 1 deletion tools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCE
# Compile run_tests without optimizations to ensure allocation tracking works correctly
# Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
set_target_properties(run_tests PROPERTIES COMPILE_OPTIONS "-O0")
# Benchmodel should be built with NAM_PROFILING set
target_compile_definitions(benchmodel PRIVATE NAM_PROFILING)
# Ensure assertions are enabled for run_tests by removing NDEBUG if it was set
# Release/RelWithDebInfo/MinSizeRel build types automatically define NDEBUG
# We use a compile option to undefine it, which works on GCC, Clang, and MSVC
Expand Down Expand Up @@ -61,4 +63,4 @@ endif()
# /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
# Don't let this break my build on debug:
set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
Loading