diff --git a/benchmark/bench_fftconv.cpp b/benchmark/bench_fftconv.cpp index 44dc529..751cc27 100644 --- a/benchmark/bench_fftconv.cpp +++ b/benchmark/bench_fftconv.cpp @@ -98,6 +98,113 @@ template void BM_arma_conv_same(benchmark::State &state) { BENCHMARK(BM_arma_conv_same)->ArgsProduct(ARGS_FIR); BENCHMARK(BM_arma_conv_same)->ArgsProduct(ARGS_FIR); +// ========================================== +// 2D Convolution Benchmarks +// ========================================== + +// Args: {rows, cols, krows, kcols} +const std::vector> ARGS_2D{{{64, 256}, {64, 256}, {5, 15}, {5, 15}}}; + +template void BM_convolve_2d_full(benchmark::State &state) { + const auto rows = static_cast(state.range(0)); + const auto cols = static_cast(state.range(1)); + const auto krows = static_cast(state.range(2)); + const auto kcols = static_cast(state.range(3)); + const auto orows = rows + krows - 1; + const auto ocols = cols + kcols - 1; + + AlignedVector a(rows * cols); + AlignedVector k(krows * kcols); + AlignedVector out(orows * ocols); + + fftconv::convolve_fftw_2d(a, rows, cols, k, krows, kcols, out, orows, ocols); + for (auto _ : state) { + fftconv::convolve_fftw_2d(a, rows, cols, k, krows, kcols, out, orows, ocols); + } + + state.SetItemsProcessed(state.iterations() * static_cast(rows * cols)); + state.SetBytesProcessed(state.iterations() * static_cast(rows * cols) * sizeof(T)); +} +BENCHMARK(BM_convolve_2d_full)->ArgsProduct(ARGS_2D); +BENCHMARK(BM_convolve_2d_full)->ArgsProduct(ARGS_2D); + +template void BM_convolve_2d_same(benchmark::State &state) { + const auto rows = static_cast(state.range(0)); + const auto cols = static_cast(state.range(1)); + const auto krows = static_cast(state.range(2)); + const auto kcols = static_cast(state.range(3)); + + AlignedVector a(rows * cols); + AlignedVector k(krows * kcols); + AlignedVector out(rows * cols); + + fftconv::convolve_fftw_2d(a, rows, cols, k, krows, kcols, out, rows, cols); + for (auto _ : state) { + fftconv::convolve_fftw_2d(a, rows, cols, k, krows, kcols, out, rows, cols); + } + + state.SetItemsProcessed(state.iterations() * static_cast(rows * cols)); + state.SetBytesProcessed(state.iterations() * static_cast(rows * cols) * sizeof(T)); +} +BENCHMARK(BM_convolve_2d_same)->ArgsProduct(ARGS_2D); +BENCHMARK(BM_convolve_2d_same)->ArgsProduct(ARGS_2D); + +// ========================================== +// 3D Convolution Benchmarks +// ========================================== + +// Args: {depth, rows, cols, kdepth, krows, kcols} +const std::vector> ARGS_3D{{{16, 64}, {16, 64}, {16, 64}, {3}, {3}, {3}}}; + +template void BM_convolve_3d_full(benchmark::State &state) { + const auto depth = static_cast(state.range(0)); + const auto rows = static_cast(state.range(1)); + const auto cols = static_cast(state.range(2)); + const auto kdepth = static_cast(state.range(3)); + const auto krows = static_cast(state.range(4)); + const auto kcols = static_cast(state.range(5)); + const auto odepth = depth + kdepth - 1; + const auto orows = rows + krows - 1; + const auto ocols = cols + kcols - 1; + + AlignedVector a(depth * rows * cols); + AlignedVector k(kdepth * krows * kcols); + AlignedVector out(odepth * orows * ocols); + + fftconv::convolve_fftw_3d(a, depth, rows, cols, k, kdepth, krows, kcols, out, odepth, orows, ocols); + for (auto _ : state) { + fftconv::convolve_fftw_3d(a, depth, rows, cols, k, kdepth, krows, kcols, out, odepth, orows, ocols); + } + + state.SetItemsProcessed(state.iterations() * static_cast(depth * rows * cols)); + state.SetBytesProcessed(state.iterations() * static_cast(depth * rows * cols) * sizeof(T)); +} +BENCHMARK(BM_convolve_3d_full)->ArgsProduct(ARGS_3D); +BENCHMARK(BM_convolve_3d_full)->ArgsProduct(ARGS_3D); + +template void BM_convolve_3d_same(benchmark::State &state) { + const auto depth = static_cast(state.range(0)); + const auto rows = static_cast(state.range(1)); + const auto cols = static_cast(state.range(2)); + const auto kdepth = static_cast(state.range(3)); + const auto krows = static_cast(state.range(4)); + const auto kcols = static_cast(state.range(5)); + + AlignedVector a(depth * rows * cols); + AlignedVector k(kdepth * krows * kcols); + AlignedVector out(depth * rows * cols); + + fftconv::convolve_fftw_3d(a, depth, rows, cols, k, kdepth, krows, kcols, out, depth, rows, cols); + for (auto _ : state) { + fftconv::convolve_fftw_3d(a, depth, rows, cols, k, kdepth, krows, kcols, out, depth, rows, cols); + } + + state.SetItemsProcessed(state.iterations() * static_cast(depth * rows * cols)); + state.SetBytesProcessed(state.iterations() * static_cast(depth * rows * cols) * sizeof(T)); +} +BENCHMARK(BM_convolve_3d_same)->ArgsProduct(ARGS_3D); +BENCHMARK(BM_convolve_3d_same)->ArgsProduct(ARGS_3D); + // NOLINTEND(*-identifier-length) // BENCHMARK_MAIN(); diff --git a/include/fftconv/fftconv.hpp b/include/fftconv/fftconv.hpp index dc3e923..76f9c56 100644 --- a/include/fftconv/fftconv.hpp +++ b/include/fftconv/fftconv.hpp @@ -6,8 +6,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -17,6 +19,20 @@ #define FFTCONV_VERSION "0.5.1" +// Hash specialization for std::array to use as unordered_map key +namespace std { +template +struct hash> { + size_t operator()(const std::array& arr) const noexcept { + size_t seed = 0; + for (const auto& val : arr) { + seed ^= hash{}(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; +} // namespace std + namespace fftconv { using fftw::Floating; using fftw::Plan; @@ -479,6 +495,376 @@ void oaconvolve_fftw(std::span input, std::span kernel, plan.template oaconvolve(input, kernel, output); } +// ============================================ +// Multi-Dimensional (ND) Convolution Support +// ============================================ + +namespace internal_nd { + +// Helper to compute total number of elements from dimensions +template +inline size_t total_size(const std::array& dims) { + size_t total = 1; + for (size_t d : dims) total *= d; + return total; +} + +// Compute linear index from row-major coordinates +template +inline size_t linear_index(const std::array& coords, + const std::array& dims) { + size_t idx = 0; + size_t stride = 1; + for (int i = static_cast(Rank) - 1; i >= 0; --i) { + idx += coords[i] * stride; + stride *= dims[i]; + } + return idx; +} + +// Copy N-dimensional data with zero-padding +template +void copy_to_padded_buffer_nd(const std::span src, + const std::array& src_dims, + std::span dst, + const std::array& dst_dims) { + std::fill(dst.begin(), dst.end(), static_cast(0)); + + // Copy row by row for efficiency + if constexpr (Rank == 1) { + std::copy(src.begin(), src.end(), dst.begin()); + } else if constexpr (Rank == 2) { + size_t src_rows = src_dims[0]; + size_t src_cols = src_dims[1]; + size_t dst_cols = dst_dims[1]; + + for (size_t r = 0; r < src_rows; ++r) { + const T* src_row = src.data() + r * src_cols; + T* dst_row = dst.data() + r * dst_cols; + std::copy(src_row, src_row + src_cols, dst_row); + } + } else if constexpr (Rank == 3) { + size_t src_depth = src_dims[0]; + size_t src_rows = src_dims[1]; + size_t src_cols = src_dims[2]; + size_t dst_rows = dst_dims[1]; + size_t dst_cols = dst_dims[2]; + + for (size_t d = 0; d < src_depth; ++d) { + for (size_t r = 0; r < src_rows; ++r) { + const T* src_slice = src.data() + d * src_rows * src_cols + r * src_cols; + T* dst_slice = dst.data() + d * dst_rows * dst_cols + r * dst_cols; + std::copy(src_slice, src_slice + src_cols, dst_slice); + } + } + } +} + +// Extract "Same" region from full convolution result +template +void extract_same_region(const std::span full, + const std::array& full_dims, + std::span same, + const std::array& same_dims, + const std::array& kernel_dims) { + // Compute padding for each dimension + std::array padding; + for (size_t i = 0; i < Rank; ++i) { + padding[i] = kernel_dims[i] / 2; + } + + if constexpr (Rank == 1) { + std::copy(full.begin() + padding[0], + full.begin() + padding[0] + same_dims[0], + same.begin()); + } else if constexpr (Rank == 2) { + size_t full_cols = full_dims[1]; + size_t same_cols = same_dims[1]; + size_t same_rows = same_dims[0]; + + for (size_t r = 0; r < same_rows; ++r) { + const T* full_row = full.data() + (r + padding[0]) * full_cols + padding[1]; + T* same_row = same.data() + r * same_cols; + std::copy(full_row, full_row + same_cols, same_row); + } + } else if constexpr (Rank == 3) { + size_t full_rows = full_dims[1]; + size_t full_cols = full_dims[2]; + size_t same_depth = same_dims[0]; + size_t same_rows = same_dims[1]; + size_t same_cols = same_dims[2]; + + for (size_t d = 0; d < same_depth; ++d) { + for (size_t r = 0; r < same_rows; ++r) { + const T* full_slice = full.data() + + (d + padding[0]) * full_rows * full_cols + + (r + padding[1]) * full_cols + padding[2]; + T* same_slice = same.data() + + d * same_rows * same_cols + r * same_cols; + std::copy(full_slice, full_slice + same_cols, same_slice); + } + } + } +} + +// Copy kernel to padded buffer (for ND) - specialized for 2D and 3D +template +void copy_kernel_to_padded_buffer_nd(const std::span src, + const std::array& src_dims, + std::span dst, + const std::array& dst_dims) { + std::fill(dst.begin(), dst.end(), static_cast(0)); + + // For convolution, we need to place the kernel in the top-left + // (FFT convolution uses circular convolution) + if constexpr (Rank == 1) { + std::copy(src.begin(), src.end(), dst.begin()); + } else if constexpr (Rank == 2) { + size_t src_rows = src_dims[0]; + size_t src_cols = src_dims[1]; + size_t dst_cols = dst_dims[1]; + + for (size_t r = 0; r < src_rows; ++r) { + const T* src_row = src.data() + r * src_cols; + T* dst_row = dst.data() + r * dst_cols; + std::copy(src_row, src_row + src_cols, dst_row); + } + } else if constexpr (Rank == 3) { + size_t src_depth = src_dims[0]; + size_t src_rows = src_dims[1]; + size_t src_cols = src_dims[2]; + size_t dst_rows = dst_dims[1]; + size_t dst_cols = dst_dims[2]; + + for (size_t d = 0; d < src_depth; ++d) { + for (size_t r = 0; r < src_rows; ++r) { + const T* src_slice = src.data() + d * src_rows * src_cols + r * src_cols; + T* dst_slice = dst.data() + d * dst_rows * dst_cols + r * dst_cols; + std::copy(src_slice, src_slice + src_cols, dst_slice); + } + } + } +} + +// Compute complex output size for R2C FFT in ND +template +inline size_t complex_output_size(const std::array& real_dims) { + size_t size = 1; + for (size_t i = 0; i < Rank - 1; ++i) { + size *= real_dims[i]; + } + size *= (real_dims[Rank - 1] / 2 + 1); + return size; +} + +} // namespace internal_nd + +// Cache mixin for multi-dimensional keys +template +struct cache_mixin_nd { + using Key = std::array; + + static auto get(Key dims) -> Child& { + thread_local std::unordered_map> cache; + + auto& val = cache[dims]; + if (val == nullptr) { + val = std::make_unique(dims); + } + return *val; + } +}; + +// Multi-dimensional FFT convolution buffer +template +struct FFTConvBufferND { + using Cx = std::complex; + + std::array padded_dims; + AlignedVector real; + AlignedVector cx1; + AlignedVector cx2; + + explicit FFTConvBufferND(const std::array& pdims) + : padded_dims(pdims), + real(internal_nd::total_size(pdims)), + cx1(internal_nd::complex_output_size(pdims)), + cx2(internal_nd::complex_output_size(pdims)) {} + + auto real_ptr() -> T* { return real.data(); } + auto cx1_ptr() -> fftw::Complex* { + return reinterpret_cast*>(cx1.data()); + } + auto cx2_ptr() -> fftw::Complex* { + return reinterpret_cast*>(cx2.data()); + } + + auto real_span() -> std::span { return real; } + auto cx1_span() -> std::span { return cx1; } + auto cx2_span() -> std::span { return cx2; } +}; + +// Helper to create FFTW R2C plan for ND +template +auto create_r2c_plan(const std::array& dims, + T* in, fftw::Complex* out, + unsigned int flags) -> fftw::Plan { + if constexpr (Rank == 2) { + return fftw::Plan::dft_r2c_2d( + static_cast(dims[0]), static_cast(dims[1]), + in, out, flags); + } else if constexpr (Rank == 3) { + return fftw::Plan::dft_r2c_3d( + static_cast(dims[0]), static_cast(dims[1]), static_cast(dims[2]), + in, out, flags); + } else { + static_assert(Rank == 2 || Rank == 3, "Only 2D and 3D supported"); + return {}; + } +} + +// Helper to create FFTW C2R plan for ND +template +auto create_c2r_plan(const std::array& dims, + fftw::Complex* in, T* out, + unsigned int flags) -> fftw::Plan { + if constexpr (Rank == 2) { + return fftw::Plan::dft_c2r_2d( + static_cast(dims[0]), static_cast(dims[1]), + in, out, flags); + } else if constexpr (Rank == 3) { + return fftw::Plan::dft_c2r_3d( + static_cast(dims[0]), static_cast(dims[1]), static_cast(dims[2]), + in, out, flags); + } else { + static_assert(Rank == 2 || Rank == 3, "Only 2D and 3D supported"); + return {}; + } +} + +// Multi-dimensional FFT convolution engine +template +struct FFTConvEngineND : public cache_mixin_nd, Rank> { + using Plan = fftw::Plan; + using Cx = fftw::Complex; + using Key = std::array; + + FFTConvBufferND buf; + Plan forward; + Plan backward; + + explicit FFTConvEngineND(Key padded_dims) + : buf(padded_dims), + forward(create_r2c_plan(padded_dims, buf.real_ptr(), buf.cx1_ptr(), PlannerFlag)), + backward(create_c2r_plan(padded_dims, buf.cx1_ptr(), buf.real_ptr(), PlannerFlag)) {} + + template + void convolve(const std::span input, const Key& input_dims, + const std::span kernel, const Key& kernel_dims, + std::span output, const Key& output_dims) { + std::fill(output.begin(), output.end(), static_cast(0)); + + // Forward FFT of input + internal_nd::copy_to_padded_buffer_nd(input, input_dims, buf.real, buf.padded_dims); + forward.execute_dft_r2c(buf.real_ptr(), buf.cx1_ptr()); + + // Forward FFT of kernel + internal_nd::copy_kernel_to_padded_buffer_nd(kernel, kernel_dims, buf.real, buf.padded_dims); + forward.execute_dft_r2c(buf.real_ptr(), buf.cx2_ptr()); + + // Complex element-wise multiplication + internal::multiply_cx(buf.cx1, buf.cx2, buf.cx1); + + // Inverse FFT + const T fct = static_cast(1.0) / internal_nd::total_size(buf.padded_dims); + + if constexpr (Mode == ConvMode::Full) { + backward.execute_dft_c2r(buf.cx1_ptr(), buf.real_ptr()); + fftw::normalize(buf.real, fct); + + // Copy the full result + const size_t full_size = internal_nd::total_size(output_dims); + std::copy(buf.real.begin(), buf.real.begin() + full_size, output.begin()); + + } else if constexpr (Mode == ConvMode::Same) { + backward.execute_dft_c2r(buf.cx1_ptr(), buf.real_ptr()); + fftw::normalize(buf.real, fct); + + // Extract the "same" region + internal_nd::extract_same_region(buf.real, buf.padded_dims, + output, output_dims, kernel_dims); + } + } +}; + +// ========================================== +// 2D Convolution Public API +// ========================================== + +template +void convolve_fftw_2d(const std::span input, size_t rows, size_t cols, + const std::span kernel, size_t krows, size_t kcols, + std::span output, size_t orows, size_t ocols) { + const std::array input_dims = {rows, cols}; + const std::array kernel_dims = {krows, kcols}; + const std::array output_dims = {orows, ocols}; + std::array padded_dims; + + for (size_t i = 0; i < 2; ++i) { + padded_dims[i] = input_dims[i] + kernel_dims[i] - 1; + } + + auto& engine = FFTConvEngineND::get(padded_dims); + engine.template convolve(input, input_dims, kernel, kernel_dims, output, output_dims); +} + +// Convenience overloads for 2D +template +void convolve_fftw_2d(const std::span input, const std::array& input_dims, + const std::span kernel, const std::array& kernel_dims, + std::span output, const std::array& output_dims) { + convolve_fftw_2d( + input, input_dims[0], input_dims[1], + kernel, kernel_dims[0], kernel_dims[1], + output, output_dims[0], output_dims[1]); +} + +// ========================================== +// 3D Convolution Public API +// ========================================== + +template +void convolve_fftw_3d(const std::span input, size_t depth, size_t rows, size_t cols, + const std::span kernel, size_t kdepth, size_t krows, size_t kcols, + std::span output, size_t odepth, size_t orows, size_t ocols) { + const std::array input_dims = {depth, rows, cols}; + const std::array kernel_dims = {kdepth, krows, kcols}; + const std::array output_dims = {odepth, orows, ocols}; + std::array padded_dims; + + for (size_t i = 0; i < 3; ++i) { + padded_dims[i] = input_dims[i] + kernel_dims[i] - 1; + } + + auto& engine = FFTConvEngineND::get(padded_dims); + engine.template convolve(input, input_dims, kernel, kernel_dims, output, output_dims); +} + +// Convenience overloads for 3D +template +void convolve_fftw_3d(const std::span input, const std::array& input_dims, + const std::span kernel, const std::array& kernel_dims, + std::span output, const std::array& output_dims) { + convolve_fftw_3d( + input, input_dims[0], input_dims[1], input_dims[2], + kernel, kernel_dims[0], kernel_dims[1], kernel_dims[2], + output, output_dims[0], output_dims[1], output_dims[2]); +} + } // namespace fftconv // NOLINTEND(*-reinterpret-cast, *-const-cast, *-pointer-arithmetic) diff --git a/py/bench/bench_nd.py b/py/bench/bench_nd.py new file mode 100644 index 0000000..c4e0373 --- /dev/null +++ b/py/bench/bench_nd.py @@ -0,0 +1,108 @@ +"""Benchmark pyfftconv 2D/3D convolution against numpy and scipy.""" + +import time +import numpy as np +from scipy.signal import fftconvolve +from pyfftconv import convolve, convolve2d, convolve3d + + +def bench(fn, *args, n_warmup=3, n_iter=20): + """Run fn(*args) n_iter times and return median time in microseconds.""" + for _ in range(n_warmup): + fn(*args) + times = [] + for _ in range(n_iter): + t0 = time.perf_counter() + fn(*args) + t1 = time.perf_counter() + times.append((t1 - t0) * 1e6) + times.sort() + return times[len(times) // 2] + + +def fmt(us): + if us >= 1e6: + return f"{us / 1e6:8.2f} s " + if us >= 1e3: + return f"{us / 1e3:8.2f} ms" + return f"{us:8.2f} us" + + +def print_row(label, t_fftconv, t_numpy=None, t_scipy=None): + parts = [f" {label:<40s} {fmt(t_fftconv)}"] + if t_numpy is not None: + speedup = t_numpy / t_fftconv + parts.append(f" {fmt(t_numpy)} ({speedup:5.2f}x)") + if t_scipy is not None: + speedup = t_scipy / t_fftconv + parts.append(f" {fmt(t_scipy)} ({speedup:5.2f}x)") + print("".join(parts)) + + +def main(): + np.random.seed(0) + + # ── 1D benchmarks ── + print("=" * 100) + print(f" {'1D Convolution':<40s} {'pyfftconv':>11s} {'np.convolve':>11s} {'ratio':>8s} {'scipy fftconv':>13s} {'ratio':>8s}") + print("-" * 100) + + for n, k in [(2304, 165), (4352, 165), (8192, 255), (65536, 1025)]: + a = np.random.randn(n) + kernel = np.random.randn(k) + + for mode in ("full", "same"): + t_fc = bench(convolve, a, kernel, mode) + t_np = bench(np.convolve, a, kernel, mode) + t_sp = bench(fftconvolve, a, kernel, mode) + label = f"[{n}] * [{k}] mode={mode}" + print_row(label, t_fc, t_np, t_sp) + + # ── 2D benchmarks ── + print() + print("=" * 100) + print(f" {'2D Convolution':<40s} {'pyfftconv':>11s} {'scipy fftconv':>13s} {'ratio':>8s}") + print("-" * 100) + + for (nr, nc), (kr, kc) in [ + ((64, 64), (5, 5)), + ((64, 64), (15, 15)), + ((256, 256), (5, 5)), + ((256, 256), (15, 15)), + ((512, 512), (15, 15)), + ]: + a = np.random.randn(nr, nc) + kernel = np.random.randn(kr, kc) + + for mode in ("full", "same"): + t_fc = bench(convolve2d, a, kernel, mode) + t_sp = bench(fftconvolve, a, kernel, mode) + label = f"[{nr}x{nc}] * [{kr}x{kc}] mode={mode}" + print_row(label, t_fc, t_scipy=t_sp) + + # ── 3D benchmarks ── + print() + print("=" * 100) + print(f" {'3D Convolution':<40s} {'pyfftconv':>11s} {'scipy fftconv':>13s} {'ratio':>8s}") + print("-" * 100) + + for (d, r, c), (kd, kr, kc) in [ + ((16, 16, 16), (3, 3, 3)), + ((32, 32, 32), (3, 3, 3)), + ((64, 64, 64), (3, 3, 3)), + ((64, 64, 64), (5, 5, 5)), + ]: + a = np.random.randn(d, r, c) + kernel = np.random.randn(kd, kr, kc) + + for mode in ("full", "same"): + t_fc = bench(convolve3d, a, kernel, mode) + t_sp = bench(fftconvolve, a, kernel, mode) + label = f"[{d}x{r}x{c}] * [{kd}x{kr}x{kc}] mode={mode}" + print_row(label, t_fc, t_scipy=t_sp) + + print("=" * 100) + + +if __name__ == "__main__": + main() diff --git a/py/main.cpp b/py/main.cpp index b5b98a0..dffeec7 100644 --- a/py/main.cpp +++ b/py/main.cpp @@ -131,10 +131,141 @@ template static py::array_t hilbert(py::array_t a) { return out; } +// 2D Convolution +template +static void convolve2d_(py::array_t a, py::array_t k, py::array_t out, + const std::string &modeStr) { + py::buffer_info bufIn = a.request(); + py::buffer_info bufK = k.request(); + py::buffer_info bufOut = out.request(); + + if (bufIn.ndim != 2 || bufK.ndim != 2 || bufOut.ndim != 2) { + throw std::runtime_error("Number of dimensions must be two"); + } + + size_t rows = static_cast(bufIn.shape[0]); + size_t cols = static_cast(bufIn.shape[1]); + size_t krows = static_cast(bufK.shape[0]); + size_t kcols = static_cast(bufK.shape[1]); + size_t orows = static_cast(bufOut.shape[0]); + size_t ocols = static_cast(bufOut.shape[1]); + + const auto mode = parseConvMode(modeStr); + if (mode == ConvMode::Same) { + fftconv::convolve_fftw_2d( + as_span(a), rows, cols, as_span(k), krows, kcols, + as_mutable_span(out), orows, ocols); + } else { + fftconv::convolve_fftw_2d( + as_span(a), rows, cols, as_span(k), krows, kcols, + as_mutable_span(out), orows, ocols); + } +} + +template +static py::array_t convolve2d(py::array_t a, py::array_t k, + const std::string &modeStr) { + py::buffer_info bufIn = a.request(); + py::buffer_info bufK = k.request(); + + if (bufIn.ndim != 2 || bufK.ndim != 2) { + throw std::runtime_error("Number of dimensions must be two"); + } + + size_t rows = static_cast(bufIn.shape[0]); + size_t cols = static_cast(bufIn.shape[1]); + size_t krows = static_cast(bufK.shape[0]); + size_t kcols = static_cast(bufK.shape[1]); + + const auto mode = parseConvMode(modeStr); + py::array_t out; + + if (mode == ConvMode::Same) { + out = py::array_t({rows, cols}); + } else { + out = py::array_t({rows + krows - 1, cols + kcols - 1}); + } + + convolve2d_(a, k, out, modeStr); + return out; +} + +// 3D Convolution +template +static void convolve3d_(py::array_t a, py::array_t k, py::array_t out, + const std::string &modeStr) { + py::buffer_info bufIn = a.request(); + py::buffer_info bufK = k.request(); + py::buffer_info bufOut = out.request(); + + if (bufIn.ndim != 3 || bufK.ndim != 3 || bufOut.ndim != 3) { + throw std::runtime_error("Number of dimensions must be three"); + } + + size_t depth = static_cast(bufIn.shape[0]); + size_t rows = static_cast(bufIn.shape[1]); + size_t cols = static_cast(bufIn.shape[2]); + size_t kdepth = static_cast(bufK.shape[0]); + size_t krows = static_cast(bufK.shape[1]); + size_t kcols = static_cast(bufK.shape[2]); + size_t odepth = static_cast(bufOut.shape[0]); + size_t orows = static_cast(bufOut.shape[1]); + size_t ocols = static_cast(bufOut.shape[2]); + + const auto mode = parseConvMode(modeStr); + if (mode == ConvMode::Same) { + fftconv::convolve_fftw_3d( + as_span(a), depth, rows, cols, as_span(k), kdepth, krows, kcols, + as_mutable_span(out), odepth, orows, ocols); + } else { + fftconv::convolve_fftw_3d( + as_span(a), depth, rows, cols, as_span(k), kdepth, krows, kcols, + as_mutable_span(out), odepth, orows, ocols); + } +} + +template +static py::array_t convolve3d(py::array_t a, py::array_t k, + const std::string &modeStr) { + py::buffer_info bufIn = a.request(); + py::buffer_info bufK = k.request(); + + if (bufIn.ndim != 3 || bufK.ndim != 3) { + throw std::runtime_error("Number of dimensions must be three"); + } + + size_t depth = static_cast(bufIn.shape[0]); + size_t rows = static_cast(bufIn.shape[1]); + size_t cols = static_cast(bufIn.shape[2]); + size_t kdepth = static_cast(bufK.shape[0]); + size_t krows = static_cast(bufK.shape[1]); + size_t kcols = static_cast(bufK.shape[2]); + + const auto mode = parseConvMode(modeStr); + py::array_t out; + + if (mode == ConvMode::Same) { + out = py::array_t({depth, rows, cols}); + } else { + out = py::array_t({depth + kdepth - 1, rows + krows - 1, cols + kcols - 1}); + } + + convolve3d_(a, k, out, modeStr); + return out; +} + const char *const convolve_doc = R"delimiter( Performs convolution using FFTW. API compatible with np.convolve )delimiter"; +const char *const convolve2d_doc = R"delimiter( +Performs 2D convolution using FFTW. Similar to scipy.signal.convolve2d +)delimiter"; + +const char *const convolve3d_doc = R"delimiter( +Performs 3D convolution using FFTW. Similar to scipy.signal.convolve +)delimiter"; + const char *const oaconvolve_doc = R"delimiter( Performs overlap-add convolution using FFTW. API compatible with np.convolve )delimiter"; @@ -157,6 +288,24 @@ PYBIND11_MODULE(_pyfftconv, m) { m.def("convolve_", convolve_, py::arg("a"), py::arg("k"), py::arg("out"), py::arg("mode") = "full", convolve_doc); + m.def("convolve2d", convolve2d, py::arg("a"), py::arg("k"), + py::arg("mode") = "full", convolve2d_doc); + m.def("convolve2d", convolve2d, py::arg("a"), py::arg("k"), + py::arg("mode") = "full", convolve2d_doc); + m.def("convolve2d_", convolve2d_, py::arg("a"), py::arg("k"), + py::arg("out"), py::arg("mode") = "full", convolve2d_doc); + m.def("convolve2d_", convolve2d_, py::arg("a"), py::arg("k"), + py::arg("out"), py::arg("mode") = "full", convolve2d_doc); + + m.def("convolve3d", convolve3d, py::arg("a"), py::arg("k"), + py::arg("mode") = "full", convolve3d_doc); + m.def("convolve3d", convolve3d, py::arg("a"), py::arg("k"), + py::arg("mode") = "full", convolve3d_doc); + m.def("convolve3d_", convolve3d_, py::arg("a"), py::arg("k"), + py::arg("out"), py::arg("mode") = "full", convolve3d_doc); + m.def("convolve3d_", convolve3d_, py::arg("a"), py::arg("k"), + py::arg("out"), py::arg("mode") = "full", convolve3d_doc); + m.def("oaconvolve", oaconvolve, py::arg("a"), py::arg("k"), py::arg("mode") = "full", oaconvolve_doc); m.def("oaconvolve", oaconvolve, py::arg("a"), py::arg("k"), diff --git a/py/pyfftconv/__init__.py b/py/pyfftconv/__init__.py index fec0672..bc3d9b6 100644 --- a/py/pyfftconv/__init__.py +++ b/py/pyfftconv/__init__.py @@ -3,6 +3,10 @@ __version__, convolve, convolve_, + convolve2d, + convolve2d_, + convolve3d, + convolve3d_, oaconvolve, oaconvolve_, hilbert, diff --git a/py/test/test_fftconv2d.py b/py/test/test_fftconv2d.py new file mode 100644 index 0000000..22faf0e --- /dev/null +++ b/py/test/test_fftconv2d.py @@ -0,0 +1,69 @@ +import unittest +import numpy as np +from scipy.signal import fftconvolve +from numpy.testing import assert_allclose + +from pyfftconv import convolve2d, convolve2d_ + + +class TestConvolve2D(unittest.TestCase): + def setUp(self): + np.random.seed(42) + self.a = np.random.randn(32, 32).astype(np.float64) + self.k = np.random.randn(5, 5).astype(np.float64) + + def test_full_double(self): + result = convolve2d(self.a, self.k, mode="full") + expected = fftconvolve(self.a, self.k, mode="full") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + def test_same_double(self): + result = convolve2d(self.a, self.k, mode="same") + expected = fftconvolve(self.a, self.k, mode="same") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + def test_full_float(self): + a = self.a.astype(np.float32) + k = self.k.astype(np.float32) + result = convolve2d(a, k, mode="full") + expected = fftconvolve(a.astype(np.float64), k.astype(np.float64), mode="full") + assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + + def test_same_float(self): + a = self.a.astype(np.float32) + k = self.k.astype(np.float32) + result = convolve2d(a, k, mode="same") + expected = fftconvolve(a.astype(np.float64), k.astype(np.float64), mode="same") + assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + + def test_full_out_double(self): + rows, cols = self.a.shape + krows, kcols = self.k.shape + out = np.empty((rows + krows - 1, cols + kcols - 1), dtype=np.float64) + convolve2d_(self.a, self.k, out, mode="full") + expected = fftconvolve(self.a, self.k, mode="full") + assert_allclose(out, expected, rtol=1e-10, atol=1e-10) + + def test_same_out_double(self): + out = np.empty_like(self.a) + convolve2d_(self.a, self.k, out, mode="same") + expected = fftconvolve(self.a, self.k, mode="same") + assert_allclose(out, expected, rtol=1e-10, atol=1e-10) + + def test_large_input(self): + a = np.random.randn(128, 128).astype(np.float64) + k = np.random.randn(15, 15).astype(np.float64) + result = convolve2d(a, k, mode="full") + expected = fftconvolve(a, k, mode="full") + assert_allclose(result, expected, rtol=1e-9, atol=1e-9) + + def test_nonsquare(self): + a = np.random.randn(64, 32).astype(np.float64) + k = np.random.randn(7, 3).astype(np.float64) + result = convolve2d(a, k, mode="full") + expected = fftconvolve(a, k, mode="full") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + +if __name__ == "__main__": + unittest.main() diff --git a/py/test/test_fftconv3d.py b/py/test/test_fftconv3d.py new file mode 100644 index 0000000..67c1a54 --- /dev/null +++ b/py/test/test_fftconv3d.py @@ -0,0 +1,62 @@ +import unittest +import numpy as np +from scipy.signal import fftconvolve +from numpy.testing import assert_allclose + +from pyfftconv import convolve3d, convolve3d_ + + +class TestConvolve3D(unittest.TestCase): + def setUp(self): + np.random.seed(42) + self.a = np.random.randn(16, 16, 16).astype(np.float64) + self.k = np.random.randn(3, 3, 3).astype(np.float64) + + def test_full_double(self): + result = convolve3d(self.a, self.k, mode="full") + expected = fftconvolve(self.a, self.k, mode="full") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + def test_same_double(self): + result = convolve3d(self.a, self.k, mode="same") + expected = fftconvolve(self.a, self.k, mode="same") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + def test_full_float(self): + a = self.a.astype(np.float32) + k = self.k.astype(np.float32) + result = convolve3d(a, k, mode="full") + expected = fftconvolve(a.astype(np.float64), k.astype(np.float64), mode="full") + assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + + def test_same_float(self): + a = self.a.astype(np.float32) + k = self.k.astype(np.float32) + result = convolve3d(a, k, mode="same") + expected = fftconvolve(a.astype(np.float64), k.astype(np.float64), mode="same") + assert_allclose(result, expected, rtol=1e-4, atol=1e-4) + + def test_full_out_double(self): + d, r, c = self.a.shape + kd, kr, kc = self.k.shape + out = np.empty((d + kd - 1, r + kr - 1, c + kc - 1), dtype=np.float64) + convolve3d_(self.a, self.k, out, mode="full") + expected = fftconvolve(self.a, self.k, mode="full") + assert_allclose(out, expected, rtol=1e-10, atol=1e-10) + + def test_same_out_double(self): + out = np.empty_like(self.a) + convolve3d_(self.a, self.k, out, mode="same") + expected = fftconvolve(self.a, self.k, mode="same") + assert_allclose(out, expected, rtol=1e-10, atol=1e-10) + + def test_nonsquare(self): + a = np.random.randn(12, 16, 8).astype(np.float64) + k = np.random.randn(3, 5, 3).astype(np.float64) + result = convolve3d(a, k, mode="full") + expected = fftconvolve(a, k, mode="full") + assert_allclose(result, expected, rtol=1e-10, atol=1e-10) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 468fe27..31e42d8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -9,30 +9,42 @@ find_package(fmt CONFIG REQUIRED) find_package(FFTW3 CONFIG REQUIRED) find_package(FFTW3f CONFIG REQUIRED) +add_executable(test_script_fftconv test_script.cpp) +set_target_properties(test_script_fftconv PROPERTIES + CXX_STANDARD 20 + CXX_EXTENSIONS OFF) +target_link_libraries(test_script_fftconv + PRIVATE + FFTW3::fftw3 + FFTW3::fftw3f + fmt::fmt + armadillo +) +target_include_directories(test_script_fftconv + PRIVATE + ../include +) -function(add_exe target) - add_executable(${target} ${ARGN}) - set_target_properties(${target} PROPERTIES - CXX_STANDARD 20 - CXX_EXTENSIONS OFF) - target_link_libraries(${target} - PRIVATE - FFTW3::fftw3 - FFTW3::fftw3f - fmt::fmt - armadillo - ) - target_include_directories(${target} - PRIVATE - ../include - ) -endfunction() - -add_exe(test_script_fftconv test_script.cpp) -add_exe(test_fftconv +add_executable(test_fftconv test_fftw.cpp test_fftconv.cpp test_hilbert.cpp + test_fftconv2d.cpp + test_fftconv3d.cpp +) +set_target_properties(test_fftconv PROPERTIES + CXX_STANDARD 20 + CXX_EXTENSIONS OFF) +target_link_libraries(test_fftconv + PRIVATE + FFTW3::fftw3 + FFTW3::fftw3f + fmt::fmt + armadillo +) +target_include_directories(test_fftconv + PRIVATE + ../include ) # On Windows, linking GTest::gmock causes gtest to not discover any tests @@ -43,4 +55,4 @@ target_link_libraries(test_fftconv ) include(GoogleTest) -gtest_discover_tests(test_fftconv) \ No newline at end of file +gtest_discover_tests(test_fftconv) diff --git a/test/test_fftconv2d.cpp b/test/test_fftconv2d.cpp new file mode 100644 index 0000000..d53a49b --- /dev/null +++ b/test/test_fftconv2d.cpp @@ -0,0 +1,311 @@ +// NOLINTBEGIN(*-magic-numbers,*-array-index) +#include +#include +#include +#include +#include + +#include + +#include "test_common.hpp" + +using fftconv::ConvMode; + +// Test 2D convolution with known simple pattern +TEST(Convolve2D, SimplePattern) { + // Armadillo is column-major, our code is row-major + // So create transposed matrices for the input + arma::Mat a_t = arma::ones>(4, 4); + arma::Mat k_t = arma::ones>(2, 2); + + // Armadillo's conv2 does: conv2(a, k) = conv2(a_t, k_t).t() + // So we need to compute expected correctly by matching memory layout + // Create our own row-major inputs + const size_t rows = 4, cols = 4; + const size_t krows = 2, kcols = 2; + const size_t orows = rows + krows - 1; + const size_t ocols = cols + kcols - 1; + + std::vector a_rowmajor(rows * cols, 1.0); + std::vector k_rowmajor(krows * kcols, 1.0); + std::vector res_rowmajor(orows * ocols, 0.0); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + rows, cols, + std::span(k_rowmajor.data(), k_rowmajor.size()), + krows, kcols, + std::span(res_rowmajor.data(), res_rowmajor.size()), + orows, ocols); + + // Expected result for row-major 4x4 ones convolved with 2x2 ones + const double expected_vals[5][5] = { + {1, 2, 2, 2, 1}, + {2, 4, 4, 4, 2}, + {2, 4, 4, 4, 2}, + {2, 4, 4, 4, 2}, + {1, 2, 2, 2, 1} + }; + + for (size_t r = 0; r < orows; ++r) { + for (size_t c = 0; c < ocols; ++c) { + EXPECT_NEAR(res_rowmajor[r * ocols + c], expected_vals[r][c], 1e-9); + } + } +} + +TEST(Convolve2D, Full) { + for (int real_rows = 8; real_rows < 32; real_rows += 8) { + for (int real_cols = 8; real_cols < 32; real_cols += 8) { + // Create data in row-major order + std::vector a_rowmajor(real_rows * real_cols); + std::vector k_rowmajor(5 * 5); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + + // Create transposed for Armadillo (column-major) + arma::Mat a_t(real_cols, real_rows); + arma::Mat k_t(5, 5); + + for (int r = 0; r < real_rows; ++r) { + for (int c = 0; c < real_cols; ++c) { + a_t(c, r) = a_rowmajor[r * real_cols + c]; + } + } + for (int kr = 0; kr < 5; ++kr) { + for (int kc = 0; kc < 5; ++kc) { + k_t(kc, kr) = k_rowmajor[kr * 5 + kc]; + } + } + + // Compute expected using Armadillo (transpose both, convolve, transpose result) + arma::Mat expected_t = arma::conv2(a_t, k_t); + int orows = real_rows + 5 - 1; + int ocols = real_cols + 5 - 1; + std::vector res_rowmajor(orows * ocols, 0.0); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + static_cast(real_rows), + static_cast(real_cols), + std::span(k_rowmajor.data(), k_rowmajor.size()), + 5, 5, + std::span(res_rowmajor.data(), res_rowmajor.size()), + static_cast(orows), + static_cast(ocols)); + + // Convert expected to row-major + std::vector expected_rowmajor(orows * ocols); + for (int r = 0; r < orows; ++r) { + for (int c = 0; c < ocols; ++c) { + expected_rowmajor[r * ocols + c] = expected_t(c, r); + } + } + + ExpectVectorsNear( + std::span(res_rowmajor.data(), res_rowmajor.size()), + std::span(expected_rowmajor.data(), expected_rowmajor.size()), + getTol()); + } + } +} + +TEST(Convolve2D, Same) { + for (int real_rows = 8; real_rows < 32; real_rows += 8) { + for (int real_cols = 8; real_cols < 32; real_cols += 8) { + // Create data in row-major order + std::vector a_rowmajor(real_rows * real_cols); + std::vector k_rowmajor(5 * 5); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + + // Create transposed for Armadillo (column-major) + arma::Mat a_t(real_cols, real_rows); + arma::Mat k_t(5, 5); + + for (int r = 0; r < real_rows; ++r) { + for (int c = 0; c < real_cols; ++c) { + a_t(c, r) = a_rowmajor[r * real_cols + c]; + } + } + for (int kr = 0; kr < 5; ++kr) { + for (int kc = 0; kc < 5; ++kc) { + k_t(kc, kr) = k_rowmajor[kr * 5 + kc]; + } + } + + // Compute expected using Armadillo + arma::Mat expected_t = arma::conv2(a_t, k_t, "same"); + std::vector res_rowmajor(real_rows * real_cols, 0.0); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + static_cast(real_rows), + static_cast(real_cols), + std::span(k_rowmajor.data(), k_rowmajor.size()), + 5, 5, + std::span(res_rowmajor.data(), res_rowmajor.size()), + static_cast(real_rows), + static_cast(real_cols)); + + // Convert expected to row-major + std::vector expected_rowmajor(real_rows * real_cols); + for (int r = 0; r < real_rows; ++r) { + for (int c = 0; c < real_cols; ++c) { + expected_rowmajor[r * real_cols + c] = expected_t(c, r); + } + } + + ExpectVectorsNear( + std::span(res_rowmajor.data(), res_rowmajor.size()), + std::span(expected_rowmajor.data(), expected_rowmajor.size()), + getTol()); + } + } +} + +TEST(Convolve2D, Float) { + // Create data in row-major order + const size_t rows = 16, cols = 16; + const size_t krows = 3, kcols = 3; + + std::vector a_rowmajor(rows * cols); + std::vector k_rowmajor(krows * kcols); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5f; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5f; + + // Create transposed for Armadillo (column-major) + arma::Mat a_t(cols, rows); + arma::Mat k_t(kcols, krows); + + for (size_t r = 0; r < rows; ++r) { + for (size_t c = 0; c < cols; ++c) { + a_t(c, r) = a_rowmajor[r * cols + c]; + } + } + for (size_t kr = 0; kr < krows; ++kr) { + for (size_t kc = 0; kc < kcols; ++kc) { + k_t(kc, kr) = k_rowmajor[kr * kcols + kc]; + } + } + + // Full mode + size_t orows = rows + krows - 1; + size_t ocols = cols + kcols - 1; + arma::Mat expected_t = arma::conv2(a_t, k_t); + std::vector res_rowmajor(orows * ocols, 0.0f); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + rows, cols, + std::span(k_rowmajor.data(), k_rowmajor.size()), + krows, kcols, + std::span(res_rowmajor.data(), res_rowmajor.size()), + orows, ocols); + + std::vector expected_rowmajor(orows * ocols); + for (size_t r = 0; r < orows; ++r) { + for (size_t c = 0; c < ocols; ++c) { + expected_rowmajor[r * ocols + c] = expected_t(c, r); + } + } + + ExpectVectorsNear( + std::span(res_rowmajor.data(), res_rowmajor.size()), + std::span(expected_rowmajor.data(), expected_rowmajor.size()), + getTol()); + + // Same mode + std::vector res_same(rows * cols, 0.0f); + arma::Mat expected_same_t = arma::conv2(a_t, k_t, "same"); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + rows, cols, + std::span(k_rowmajor.data(), k_rowmajor.size()), + krows, kcols, + std::span(res_same.data(), res_same.size()), + rows, cols); + + std::vector expected_same_rowmajor(rows * cols); + for (size_t r = 0; r < rows; ++r) { + for (size_t c = 0; c < cols; ++c) { + expected_same_rowmajor[r * cols + c] = expected_same_t(c, r); + } + } + + ExpectVectorsNear( + std::span(res_same.data(), res_same.size()), + std::span(expected_same_rowmajor.data(), expected_same_rowmajor.size()), + getTol()); +} + +// Test 2D vs 1D convolution for separable kernels +TEST(Convolve2D, SeparableKernel) { + // Create data in row-major order + const size_t rows = 32, cols = 32; + const size_t ksize = 5; + + std::vector a_rowmajor(rows * cols); + std::vector row_k(ksize); + std::vector col_k(ksize); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : row_k) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : col_k) val = static_cast(rand()) / RAND_MAX - 0.5; + + // Create 2D kernel as outer product (separable) + std::vector k_rowmajor(ksize * ksize); + for (size_t kr = 0; kr < ksize; ++kr) { + for (size_t kc = 0; kc < ksize; ++kc) { + k_rowmajor[kr * ksize + kc] = col_k[kr] * row_k[kc]; + } + } + + // Create transposed for Armadillo + arma::Mat a_t(cols, rows); + arma::Mat k_t(ksize, ksize); + + for (size_t r = 0; r < rows; ++r) { + for (size_t c = 0; c < cols; ++c) { + a_t(c, r) = a_rowmajor[r * cols + c]; + } + } + for (size_t kr = 0; kr < ksize; ++kr) { + for (size_t kc = 0; kc < ksize; ++kc) { + k_t(kc, kr) = k_rowmajor[kr * ksize + kc]; + } + } + + arma::Mat expected_t = arma::conv2(a_t, k_t, "same"); + std::vector res_rowmajor(rows * cols, 0.0); + + fftconv::convolve_fftw_2d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + rows, cols, + std::span(k_rowmajor.data(), k_rowmajor.size()), + ksize, ksize, + std::span(res_rowmajor.data(), res_rowmajor.size()), + rows, cols); + + std::vector expected_rowmajor(rows * cols); + for (size_t r = 0; r < rows; ++r) { + for (size_t c = 0; c < cols; ++c) { + expected_rowmajor[r * cols + c] = expected_t(c, r); + } + } + + ExpectVectorsNear( + std::span(res_rowmajor.data(), res_rowmajor.size()), + std::span(expected_rowmajor.data(), expected_rowmajor.size()), + getTol()); +} + +// NOLINTEND(*-magic-numbers,*-array-index) diff --git a/test/test_fftconv3d.cpp b/test/test_fftconv3d.cpp new file mode 100644 index 0000000..4042652 --- /dev/null +++ b/test/test_fftconv3d.cpp @@ -0,0 +1,245 @@ +// NOLINTBEGIN(*-magic-numbers,*-array-index) +#include +#include +#include +#include +#include + +#include + +#include "test_common.hpp" + +using fftconv::ConvMode; + +// Helper to compute 3D convolution manually (for testing small sizes only) - ROW-MAJOR +template +std::vector manual_conv3d_rowmajor(const std::vector& a, const std::array& a_dims, + const std::vector& b, const std::array& b_dims) { + const size_t aDepth = a_dims[0]; + const size_t aRows = a_dims[1]; + const size_t aCols = a_dims[2]; + + const size_t bDepth = b_dims[0]; + const size_t bRows = b_dims[1]; + const size_t bCols = b_dims[2]; + + const size_t outDepth = aDepth + bDepth - 1; + const size_t outRows = aRows + bRows - 1; + const size_t outCols = aCols + bCols - 1; + + std::vector result(outDepth * outRows * outCols, static_cast(0)); + + for (size_t d = 0; d < aDepth; ++d) { + for (size_t r = 0; r < aRows; ++r) { + for (size_t c = 0; c < aCols; ++c) { + const T val = a[d * aRows * aCols + r * aCols + c]; + if (val == 0) continue; + + for (size_t bd = 0; bd < bDepth; ++bd) { + for (size_t br = 0; br < bRows; ++br) { + for (size_t bc = 0; bc < bCols; ++bc) { + const size_t od = d + bd; + const size_t orow = r + br; + const size_t ocol = c + bc; + + result[od * outRows * outCols + orow * outCols + ocol] += val * b[bd * bRows * bCols + br * bCols + bc]; + } + } + } + } + } + } + + return result; +} + +// Helper to compare two row-major 3D arrays +template +void ExpectCubesNearRowMajor(const std::vector& actual, const std::array& actual_dims, + const std::vector& expected, const std::array& expected_dims, + T tolerance) { + ASSERT_EQ(actual_dims[0], expected_dims[0]); + ASSERT_EQ(actual_dims[1], expected_dims[1]); + ASSERT_EQ(actual_dims[2], expected_dims[2]); + + ExpectVectorsNear(std::span(actual.data(), actual.size()), + std::span(expected.data(), expected.size()), + tolerance); +} + +TEST(Convolve3D, SimplePattern) { + // Create data in row-major order + const std::array a_dims = {2, 2, 2}; + const std::array k_dims = {2, 2, 2}; + std::vector a_rowmajor(a_dims[0] * a_dims[1] * a_dims[2], 1.0); + std::vector k_rowmajor(k_dims[0] * k_dims[1] * k_dims[2], 1.0); + + const std::array expected_dims = { + a_dims[0] + k_dims[0] - 1, + a_dims[1] + k_dims[1] - 1, + a_dims[2] + k_dims[2] - 1 + }; + std::vector expected = manual_conv3d_rowmajor(a_rowmajor, a_dims, + k_rowmajor, k_dims); + + std::vector res(expected_dims[0] * expected_dims[1] * expected_dims[2], 0.0); + + fftconv::convolve_fftw_3d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + a_dims[0], a_dims[1], a_dims[2], + std::span(k_rowmajor.data(), k_rowmajor.size()), + k_dims[0], k_dims[1], k_dims[2], + std::span(res.data(), res.size()), + expected_dims[0], expected_dims[1], expected_dims[2]); + + ExpectCubesNearRowMajor(res, expected_dims, expected, expected_dims, getTol()); +} + +TEST(Convolve3D, Full) { + for (int depth : {4, 8}) { + for (int rows : {4, 8}) { + for (int cols : {4, 8}) { + const std::array a_dims = {static_cast(depth), static_cast(rows), static_cast(cols)}; + const std::array k_dims = {3, 3, 3}; + std::vector a_rowmajor(depth * rows * cols); + std::vector k_rowmajor(3 * 3 * 3); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + + const std::array expected_dims = { + a_dims[0] + k_dims[0] - 1, + a_dims[1] + k_dims[1] - 1, + a_dims[2] + k_dims[2] - 1 + }; + std::vector expected = manual_conv3d_rowmajor(a_rowmajor, a_dims, + k_rowmajor, k_dims); + + std::vector res(expected_dims[0] * expected_dims[1] * expected_dims[2], 0.0); + + fftconv::convolve_fftw_3d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + a_dims[0], a_dims[1], a_dims[2], + std::span(k_rowmajor.data(), k_rowmajor.size()), + k_dims[0], k_dims[1], k_dims[2], + std::span(res.data(), res.size()), + expected_dims[0], expected_dims[1], expected_dims[2]); + + ExpectCubesNearRowMajor(res, expected_dims, expected, expected_dims, getTol()); + } + } + } +} + +TEST(Convolve3D, Same) { + for (int depth : {5, 9}) { + for (int rows : {5, 9}) { + for (int cols : {5, 9}) { + const std::array a_dims = {static_cast(depth), static_cast(rows), static_cast(cols)}; + const std::array k_dims = {3, 3, 3}; + std::vector a_rowmajor(depth * rows * cols); + std::vector k_rowmajor(3 * 3 * 3); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + + const std::array full_dims = { + a_dims[0] + k_dims[0] - 1, + a_dims[1] + k_dims[1] - 1, + a_dims[2] + k_dims[2] - 1 + }; + + std::vector full = manual_conv3d_rowmajor(a_rowmajor, a_dims, + k_rowmajor, k_dims); + + // Extract "same" region (center) + const size_t padD = k_dims[0] / 2; + const size_t padR = k_dims[1] / 2; + const size_t padC = k_dims[2] / 2; + std::vector expected_rowmajor(a_dims[0] * a_dims[1] * a_dims[2], 0.0); + + for (size_t d = 0; d < a_dims[0]; ++d) { + for (size_t r = 0; r < a_dims[1]; ++r) { + for (size_t c = 0; c < a_dims[2]; ++c) { + const size_t full_d = d + padD; + const size_t full_r = r + padR; + const size_t full_c = c + padC; + + expected_rowmajor[d * a_dims[1] * a_dims[2] + r * a_dims[2] + c] = + full[full_d * full_dims[1] * full_dims[2] + full_r * full_dims[2] + full_c]; + } + } + } + + std::vector res(a_dims[0] * a_dims[1] * a_dims[2], 0.0); + + fftconv::convolve_fftw_3d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + a_dims[0], a_dims[1], a_dims[2], + std::span(k_rowmajor.data(), k_rowmajor.size()), + k_dims[0], k_dims[1], k_dims[2], + std::span(res.data(), res.size()), + a_dims[0], a_dims[1], a_dims[2]); + + ExpectCubesNearRowMajor(res, a_dims, expected_rowmajor, a_dims, getTol()); + } + } + } +} + +TEST(Convolve3D, Float) { + const std::array a_dims = {6, 6, 6}; + const std::array k_dims = {3, 3, 3}; + std::vector a_rowmajor(a_dims[0] * a_dims[1] * a_dims[2]); + std::vector k_rowmajor(k_dims[0] * k_dims[1] * k_dims[2]); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5f; + for (auto& val : k_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5f; + + const std::array expected_dims = { + a_dims[0] + k_dims[0] - 1, + a_dims[1] + k_dims[1] - 1, + a_dims[2] + k_dims[2] - 1 + }; + std::vector expected = manual_conv3d_rowmajor(a_rowmajor, a_dims, + k_rowmajor, k_dims); + + std::vector res(expected_dims[0] * expected_dims[1] * expected_dims[2], 0.0f); + + fftconv::convolve_fftw_3d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + a_dims[0], a_dims[1], a_dims[2], + std::span(k_rowmajor.data(), k_rowmajor.size()), + k_dims[0], k_dims[1], k_dims[2], + std::span(res.data(), res.size()), + expected_dims[0], expected_dims[1], expected_dims[2]); + + ExpectCubesNearRowMajor(res, expected_dims, expected, expected_dims, getTol()); +} + +TEST(Convolve3D, SingleElementKernel) { + const std::array a_dims = {4, 4, 4}; + const std::array k_dims = {1, 1, 1}; + std::vector a_rowmajor(a_dims[0] * a_dims[1] * a_dims[2]); + std::vector k_rowmajor(k_dims[0] * k_dims[1] * k_dims[2], 1.0); + + // Fill with random data + for (auto& val : a_rowmajor) val = static_cast(rand()) / RAND_MAX - 0.5; + + std::vector res(a_dims[0] * a_dims[1] * a_dims[2], 0.0); + + fftconv::convolve_fftw_3d( + std::span(a_rowmajor.data(), a_rowmajor.size()), + a_dims[0], a_dims[1], a_dims[2], + std::span(k_rowmajor.data(), k_rowmajor.size()), + k_dims[0], k_dims[1], k_dims[2], + std::span(res.data(), res.size()), + a_dims[0], a_dims[1], a_dims[2]); + + ExpectCubesNearRowMajor(res, a_dims, a_rowmajor, a_dims, getTol()); +} + +// NOLINTEND(*-magic-numbers,*-array-index)