From fcaaf19b74405d32e6b3777a2d3ec16e25f3ea55 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Wed, 21 Jan 2026 08:20:13 +0000
Subject: [PATCH] Parallelize CustomWaveform and stbi_write_png using OpenMP

Implemented OpenMP parallelization for:
1. CustomWaveform::Draw: Added a parallel path for waveforms without per-point code, replicating the default vertex generation logic to avoid shared state contention.
2. CustomWaveform::SmoothWave: Parallelized the smoothing loop by calculating dependency indices locally.
3. stbi_write_png (SOIL2): Parallelized the PNG line filtering loop, ensuring thread-local scratch buffers.

Also updated vendor/SOIL2/CMakeLists.txt to link OpenMP::OpenMP_C when enabled.
---
 .../MilkdropPreset/CustomWaveform.cpp         | 71 ++++++++++++++-----
 vendor/SOIL2/CMakeLists.txt                   |  5 ++
 vendor/SOIL2/src/SOIL2/stb_image_write.h      | 53 ++++++++++++++
 3 files changed, 112 insertions(+), 17 deletions(-)
diff --git a/src/libprojectM/MilkdropPreset/CustomWaveform.cpp b/src/libprojectM/MilkdropPreset/CustomWaveform.cpp
index 3ca7bbe75..323c1405f 100644
--- a/src/libprojectM/MilkdropPreset/CustomWaveform.cpp
+++ b/src/libprojectM/MilkdropPreset/CustomWaveform.cpp
@@ -8,6 +8,10 @@
 #include <algorithm>
 #include <cmath>
 
+#ifdef PRJM_ENABLE_OPENMP
+#include <omp.h>
+#endif
+
 namespace libprojectM {
 namespace MilkdropPreset {
 
@@ -141,20 +145,55 @@ void CustomWaveform::Draw(const PerFrameContext& presetPerFrameContext)
     std::vector<Renderer::Color> colors(sampleCount);
 
     float const sampleMultiplicator = sampleCount > 1 ? 1.0f / static_cast<float>(sampleCount - 1) : 0.0f;
-    for (int sample = 0; sample < sampleCount; sample++)
+
+    // Fast path: with no per-point code the loop below only reads local constants and the sample
+    // arrays, and each iteration writes its own output slot, so samples can run in parallel.
+    if (!m_perPointContext.perPointCodeHandle)
+    {
+        const float invAspectX = m_presetState.renderContext.invAspectX;
+        const float invAspectY = m_presetState.renderContext.invAspectY;
+        const float r = static_cast<float>(*m_perFrameContext.r);
+        const float g = static_cast<float>(*m_perFrameContext.g);
+        const float b = static_cast<float>(*m_perFrameContext.b);
+        const float a = static_cast<float>(*m_perFrameContext.a);
+
+        // The color is identical for every vertex on this path; wrap it once, not per sample.
+        auto const waveColor = Renderer::Color::Modulo(Renderer::Color(r, g, b, a));
+
+#ifdef PRJM_ENABLE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+        for (int sample = 0; sample < sampleCount; sample++)
+        {
+            float const value1 = sampleDataL[sample];
+            float const value2 = sampleDataR[sample];
+
+            // x = 0.5 + value1, y = 0.5 + value2
+            // x' = (x * 2.0 - 1.0) * invAspectX = ((0.5 + value1) * 2.0 - 1.0) * invAspectX = 2.0 * value1 * invAspectX
+            // y' = (y * -2.0 + 1.0) * invAspectY = ((0.5 + value2) * -2.0 + 1.0) * invAspectY = -2.0 * value2 * invAspectY
+
+            points[sample] = Renderer::Point(value1 * 2.0f * invAspectX, value2 * -2.0f * invAspectY);
+            colors[sample] = waveColor;
+        }
+    }
+    else
     {
+        // This path goes through the single shared m_perPointContext, so it stays sequential.
-        float const sampleIndex = static_cast<float>(sample) * sampleMultiplicator;
-        LoadPerPointEvaluationVariables(sampleIndex, sampleDataL[sample], sampleDataR[sample]);
+        for (int sample = 0; sample < sampleCount; sample++)
+        {
+            float const sampleIndex = static_cast<float>(sample) * sampleMultiplicator;
+            LoadPerPointEvaluationVariables(sampleIndex, sampleDataL[sample], sampleDataR[sample]);
 
-        m_perPointContext.ExecutePerPointCode();
+            m_perPointContext.ExecutePerPointCode();
 
-        points[sample] = Renderer::Point(static_cast<float>((*m_perPointContext.x * 2.0 - 1.0) * m_presetState.renderContext.invAspectX),
-                                         static_cast<float>((*m_perPointContext.y * -2.0 + 1.0) * m_presetState.renderContext.invAspectY));
+            points[sample] = Renderer::Point(static_cast<float>((*m_perPointContext.x * 2.0 - 1.0) * m_presetState.renderContext.invAspectX),
+                                             static_cast<float>((*m_perPointContext.y * -2.0 + 1.0) * m_presetState.renderContext.invAspectY));
 
-        colors[sample] = Renderer::Color::Modulo(Renderer::Color(static_cast<float>(*m_perPointContext.r),
-                                                                 static_cast<float>(*m_perPointContext.g),
-                                                                 static_cast<float>(*m_perPointContext.b),
-                                                                 static_cast<float>(*m_perPointContext.a)));
+            colors[sample] = Renderer::Color::Modulo(Renderer::Color(static_cast<float>(*m_perPointContext.r),
+                                                                     static_cast<float>(*m_perPointContext.g),
+                                                                     static_cast<float>(*m_perPointContext.b),
+                                                                     static_cast<float>(*m_perPointContext.a)));
+        }
     }
 
     SmoothWave(points, colors);
@@ -270,18 +303,27 @@ void CustomWaveform::SmoothWave(const std::vector<Renderer::Point>& points, cons
     constexpr float inverseSum{1.0f / (c1 + c2 + c3 + c4)};
 
     size_t outputIndex = 0;
-    size_t iBelow = 0;
-    size_t iAbove2 = 1;
-
     size_t vertexCount = points.size();
 
     auto& outVertices = m_mesh.Vertices();
     auto& outColors = m_mesh.Colors();
 
-    for (size_t inputIndex = 0; inputIndex < vertexCount - 1; inputIndex++)
+    // OpenMP 2.0, which is all MSVC's default /openmp mode provides, only accepts a signed loop
+    // index, so count segments with an int and convert to size_t inside the body.
+    int const segmentCount = static_cast<int>(vertexCount) - 1;
+
+#ifdef PRJM_ENABLE_OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+    for (int segment = 0; segment < segmentCount; segment++)
     {
-        size_t const iAbove = iAbove2;
-        iAbove2 = std::min(vertexCount - 1, inputIndex + 2);
+        // Indices are computed from the loop counter alone rather than carried over from the previous iteration.
+        size_t const inputIndex = static_cast<size_t>(segment);
+        size_t const outputIndex = inputIndex * 2;
+        size_t const iBelow = (inputIndex == 0) ? 0 : inputIndex - 1;
+        size_t const iAbove = std::min(vertexCount - 1, inputIndex + 1);
+        size_t const iAbove2 = std::min(vertexCount - 1, inputIndex + 2);
+
         outVertices[outputIndex] = points[inputIndex];
         outColors[outputIndex] = colors[inputIndex];
         outColors[outputIndex + 1] = colors[inputIndex];
@@ -289,10 +325,11 @@ void CustomWaveform::SmoothWave(const std::vector<Renderer::Point>& points, cons
         smoothedPoint = points[inputIndex];
         smoothedPoint.SetX((c1 * points[iBelow].X() + c2 * points[inputIndex].X() + c3 * points[iAbove].X() + c4 * points[iAbove2].X()) * inverseSum);
         smoothedPoint.SetY((c1 * points[iBelow].Y() + c2 * points[inputIndex].Y() + c3 * points[iAbove].Y() + c4 * points[iAbove2].Y()) * inverseSum);
-        iBelow = inputIndex;
-        outputIndex += 2;
     }
 
+    // Set outputIndex to the end for the final point assignment
+    outputIndex = (vertexCount - 1) * 2;
+
     outVertices[outputIndex] = points[vertexCount - 1];
     outColors[outputIndex] = colors[vertexCount - 1];
 
diff --git a/vendor/SOIL2/CMakeLists.txt b/vendor/SOIL2/CMakeLists.txt
index f26642eab..ef01c2f71 100644
--- a/vendor/SOIL2/CMakeLists.txt
+++ b/vendor/SOIL2/CMakeLists.txt
@@ -36,6 +36,11 @@ target_link_libraries(SOIL2
         ${PROJECTM_OPENGL_LIBRARIES}
         )
 
+if(ENABLE_OPENMP AND OpenMP_C_FOUND)
+    target_link_libraries(SOIL2 PRIVATE OpenMP::OpenMP_C)
+    target_compile_definitions(SOIL2 PRIVATE PRJM_ENABLE_OPENMP)
+endif()
+
 if(USE_GLES)
     target_compile_definitions(SOIL2
             PRIVATE
diff --git a/vendor/SOIL2/src/SOIL2/stb_image_write.h b/vendor/SOIL2/src/SOIL2/stb_image_write.h
index ab2de5c20..8a8ea82c6 100644
--- a/vendor/SOIL2/src/SOIL2/stb_image_write.h
+++ b/vendor/SOIL2/src/SOIL2/stb_image_write.h
@@ -203,6 +203,10 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 
+#ifdef PRJM_ENABLE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef _WIN32
    #ifndef _CRT_SECURE_NO_WARNINGS
    #define _CRT_SECURE_NO_WARNINGS
@@ -1147,6 +1151,59 @@ STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int s
    }
 
    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
+#ifdef PRJM_ENABLE_OPENMP
+   {
+       // Cleared by any thread whose scratch allocation fails; only read after the barrier below.
+       int success = 1;
+#pragma omp parallel shared(success)
+       {
+           // Each thread filters lines into its own scratch buffer.
+           signed char *line_buffer = (signed char *) STBIW_MALLOC(x * n);
+           if (!line_buffer) {
+#pragma omp atomic write
+               success = 0;
+           }
+           // OpenMP requires a worksharing loop to be reached by every thread of the team or by
+           // none, so it must not sit in the else branch of the allocation check. The barrier
+           // makes all writes to 'success' visible before the plain reads inside the loop.
+#pragma omp barrier
+#pragma omp for
+           for (j=0; j < y; ++j) {
+               if (success) {
+                   int filter_type;
+                   if (force_filter > -1) {
+                       filter_type = force_filter;
+                       stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+                   } else { // Estimate the best filter by running through all of them:
+                       int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+                       for (filter_type = 0; filter_type < 5; filter_type++) {
+                           stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+                           // Estimate the entropy of the line using this filter; the less, the better.
+                           est = 0;
+                           for (i = 0; i < x*n; ++i) {
+                               est += abs((signed char) line_buffer[i]);
+                           }
+                           if (est < best_filter_val) {
+                               best_filter_val = est;
+                               best_filter = filter_type;
+                           }
+                       }
+                       if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+                           stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+                           filter_type = best_filter;
+                       }
+                   }
+                   // when we get here, filter_type contains the filter type, and line_buffer contains the data
+                   filt[j*(x*n+1)] = (unsigned char) filter_type;
+                   STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
+               }
+           }
+           if (line_buffer) STBIW_FREE(line_buffer);
+       }
+       if (!success) { STBIW_FREE(filt); return 0; }
+   }
+#else
    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
    for (j=0; j < y; ++j) {
       int filter_type;
@@ -1178,6 +1230,7 @@ STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int s
       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
    }
    STBIW_FREE(line_buffer);
+#endif
    zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
    STBIW_FREE(filt);
    if (!zlib) return 0;