From 9f66822b3c3cc97fed1a18e8a412d6a96ec60d8b Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Mon, 9 Jun 2025 01:04:13 +0200
Subject: [PATCH 1/7] Parallelise Union-Find Decoder

---
 cpp_test/TestUnionFind.cpp                    | 12 +++++++++
 src_cpp/union_find.hpp                        | 27 +++++++++++++++----
 .../ldpc/union_find_decoder/__init__.pyi      |  2 +-
 .../_union_find_decoder.pxd                   |  5 ++--
 .../_union_find_decoder.pyx                   |  5 ++--
 5 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/cpp_test/TestUnionFind.cpp b/cpp_test/TestUnionFind.cpp
index 26aad77a..e6108903 100644
--- a/cpp_test/TestUnionFind.cpp
+++ b/cpp_test/TestUnionFind.cpp
@@ -209,6 +209,18 @@ TEST(UfDecoder, peeling_with_boundaries_edge_case){
 
 }
 
+TEST(UfDecoder, parallel_peeling){
+    auto pcm = ldpc::gf2codes::ring_code<ldpc::bp::BpEntry>(10);
+    UfDecoder ufd(pcm,4);
+    std::vector<uint8_t> syndrome(pcm.m,0);
+    syndrome[0]=1;
+    syndrome[1]=1;
+    auto decoding = ufd.peel_decode(syndrome);
+    std::vector<uint8_t> expected(pcm.n,0);
+    expected[1]=1;
+    ASSERT_EQ(decoding,expected);
+}
+
 
 
 int main(int argc, char **argv)
diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp
index d316c5b4..1129f764 100644
--- a/src_cpp/union_find.hpp
+++ b/src_cpp/union_find.hpp
@@ -14,6 +14,7 @@
 #include <robin_map.h>
 #include <robin_set.h>
 #include <numeric>
+#include <mutex>
 
 #include "gf2sparse_linalg.hpp"
 #include "bp.hpp"
@@ -22,6 +23,7 @@ namespace ldpc::uf {
 
     const std::vector<double> EMPTY_DOUBLE_VECTOR = {};
     tsl::robin_set<int> EMPTY_INT_ROBIN_SET = {};
+    inline std::mutex uf_global_mutex;
 
     std::vector<int> sort_indices(std::vector<double> &B) {
         std::vector<int> indices(B.size());
@@ -107,6 +109,7 @@ namespace ldpc::uf {
         }
 
         int add_bit_node_to_cluster(int bit_index) {
+            std::lock_guard<std::mutex> lk(uf_global_mutex);
             auto bit_membership = this->global_bit_membership[bit_index];
             if (bit_membership == this) return 0; //if the bit is already in the cluster terminate.
             else if (bit_membership == NULL) {
@@ -141,6 +144,7 @@ namespace ldpc::uf {
         }
 
         void merge_with_cluster(Cluster *cl2) {
+            std::lock_guard<std::mutex> lk(uf_global_mutex);
             for (auto bit_index: cl2->bit_nodes) {
                 this->bit_nodes.insert(bit_index);
                 this->global_bit_membership[bit_index] = this;
@@ -407,12 +411,14 @@ namespace ldpc::uf {
         int check_count;
         tsl::robin_set<int> planar_code_boundary_bits;
         bool pcm_max_bit_degree_2;
+        int omp_thread_count;
 
-        UfDecoder(ldpc::bp::BpSparse &parity_check_matrix) : pcm(parity_check_matrix) {
+        UfDecoder(ldpc::bp::BpSparse &parity_check_matrix, int omp_threads = 1) : pcm(parity_check_matrix) {
             this->bit_count = pcm.n;
             this->check_count = pcm.m;
             this->decoding.resize(this->bit_count);
             this->weighted = false;
+            this->omp_thread_count = omp_threads;
 
             this->pcm_max_bit_degree_2 = true;
             for (auto i = 0; i < this->pcm.n; i++) {
@@ -452,15 +458,21 @@ namespace ldpc::uf {
             }
 
             while (!invalid_clusters.empty()) {
-                for (auto cl: invalid_clusters) {
+                #pragma omp parallel for num_threads(this->omp_thread_count) schedule(dynamic)
+                for (size_t i = 0; i < invalid_clusters.size(); i++) {
+                    Cluster *cl = invalid_clusters[i];
                     if (cl->active) {
+                        #pragma omp critical
                         cl->grow_cluster(bit_weights, bits_per_step);
                     }
                 }
 
                 invalid_clusters.clear();
-                for (auto cl: clusters) {
+                #pragma omp parallel for num_threads(this->omp_thread_count)
+                for (size_t i = 0; i < clusters.size(); i++) {
+                    Cluster *cl = clusters[i];
                     if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) {
+                        #pragma omp critical
                         invalid_clusters.push_back(cl);
                     }
                 }
@@ -468,10 +480,15 @@ namespace ldpc::uf {
                     return lhs->bit_nodes.size() < rhs->bit_nodes.size();
                 });
             }
-            for (auto cl: clusters) {
+            #pragma omp parallel for num_threads(this->omp_thread_count)
+            for (size_t i = 0; i < clusters.size(); i++) {
+                Cluster *cl = clusters[i];
                 if (cl->active) {
                     auto erasure = cl->peel_decode(syndrome);
-                    for (int bit: erasure) this->decoding[bit] = 1;
+                    for (int bit: erasure) {
+                        #pragma omp critical
+                        this->decoding[bit] = 1;
+                    }
                 }
                 delete cl;
             }
diff --git a/src_python/ldpc/union_find_decoder/__init__.pyi b/src_python/ldpc/union_find_decoder/__init__.pyi
index c341787a..6d093642 100644
--- a/src_python/ldpc/union_find_decoder/__init__.pyi
+++ b/src_python/ldpc/union_find_decoder/__init__.pyi
@@ -18,7 +18,7 @@ class UnionFindDecoder:
         Default is False.
     """
  
-    def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False): ...
+    def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False, omp_thread_count: int = 1): ...
 
     def __del__(self): ...
 
diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
index 98bad540..a7f20e38 100644
--- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
+++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
@@ -10,7 +10,7 @@ ctypedef np.uint8_t uint8_t
 
 cdef extern from "union_find.hpp" namespace "ldpc::uf":
     cdef cppclass uf_decoder_cpp "ldpc::uf::UfDecoder":
-        uf_decoder_cpp(BpSparse& pcm) except +
+        uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except +
         vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
         vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
         vector[uint8_t] decoding
@@ -26,4 +26,5 @@ cdef class UnionFindDecoder():
     cdef vector[uint8_t] _syndrome
     cdef vector[double] uf_llrs
     cdef bool uf_method
-    cdef int bits_per_step
\ No newline at end of file
+    cdef int bits_per_step
+    cdef int omp_thread_count
\ No newline at end of file
diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
index 8bb97b80..bf2bfc21 100644
--- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
+++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
@@ -60,7 +60,7 @@ cdef class UnionFindDecoder:
         Default is False.
     """ 
  
-    def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False):
+    def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False, omp_thread_count: int = 1):
         
         self.MEMORY_ALLOCATED=False
 
@@ -75,7 +75,8 @@ cdef class UnionFindDecoder:
         # get the parity check dimensions
         self.m, self.n = pcm.shape[0], pcm.shape[1]
 
-        self.ufd = new uf_decoder_cpp(self.pcm[0])
+        self.ufd = new uf_decoder_cpp(self.pcm[0], omp_thread_count)
+        self.omp_thread_count = omp_thread_count
         self._syndrome.resize(self.m) #C vector for the syndrome
         self.uf_llrs.resize(self.n) #C vector for the log-likehood ratios
         self.uf_method = uf_method

From b8f4d8a754c9a176a94ffd4f6f327df8f902622e Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Mon, 9 Jun 2025 01:34:45 +0200
Subject: [PATCH 2/7] Parallelise Union-Find Decoder

---
 CMakeLists.txt | 5 ++++-
 setup.py       | 6 ++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73a4eedd..f608dda1 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ project(ldpc
     LANGUAGES CXX)
 
 # Enable OpenMP support for parallel programming
-# SET(CMAKE_CXX_FLAGS  "-fopenmp")
+find_package(OpenMP)
 
 # Ensure that the necessary C++11 features are available, as required by the RapidCSV library
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -24,6 +24,9 @@ option(BUILD_LDPC_TESTS "Also build tests for the LDPC project" ${LDPC_MASTER_PR
 
 # Define an interface library named 'ldpc' which only includes headers
 add_library(ldpc INTERFACE)
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(ldpc INTERFACE OpenMP::OpenMP_CXX)
+endif()
 
 # Add specific directories to the include path for any target that links with the 'ldpc' library
 target_include_directories(ldpc INTERFACE src_cpp include/robin_map include/rapidcsv)
diff --git a/setup.py b/setup.py
index 27423240..69ad4ca2 100755
--- a/setup.py
+++ b/setup.py
@@ -64,10 +64,8 @@ def generate_cython_stub_file(pyx_filepath: str, output_filepath: str) -> None:
     # compile_flags = ["/Ox", "/std:c++20",'-fopenmp']
     # extra_link_args =['-lgomp','-fopenmp'],
 else:
-    compile_flags = ["-std=c++2a", "-O3"]
-    extra_link_args = []
-    # compile_flags = ["-std=c++2a", "-O3", "-fopenmp"]
-    # extra_link_args =['-lgomp','-fopenmp'],
+    compile_flags = ["-std=c++2a", "-O3", "-fopenmp"]
+    extra_link_args = ['-lgomp','-fopenmp']
 
 this_directory = Path(__file__).parent
 

From b57f8882132b18f2a53ca1c26f721455e9ab8fa6 Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Mon, 9 Jun 2025 19:26:02 +0200
Subject: [PATCH 3/7] Revert "Parallelise Union-Find Decoder"

This reverts commit b8f4d8a754c9a176a94ffd4f6f327df8f902622e.
---
 CMakeLists.txt | 5 +----
 setup.py       | 6 ++++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f608dda1..73a4eedd 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ project(ldpc
     LANGUAGES CXX)
 
 # Enable OpenMP support for parallel programming
-find_package(OpenMP)
+# SET(CMAKE_CXX_FLAGS  "-fopenmp")
 
 # Ensure that the necessary C++11 features are available, as required by the RapidCSV library
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -24,9 +24,6 @@ option(BUILD_LDPC_TESTS "Also build tests for the LDPC project" ${LDPC_MASTER_PR
 
 # Define an interface library named 'ldpc' which only includes headers
 add_library(ldpc INTERFACE)
-if(OpenMP_CXX_FOUND)
-    target_link_libraries(ldpc INTERFACE OpenMP::OpenMP_CXX)
-endif()
 
 # Add specific directories to the include path for any target that links with the 'ldpc' library
 target_include_directories(ldpc INTERFACE src_cpp include/robin_map include/rapidcsv)
diff --git a/setup.py b/setup.py
index 69ad4ca2..27423240 100755
--- a/setup.py
+++ b/setup.py
@@ -64,8 +64,10 @@ def generate_cython_stub_file(pyx_filepath: str, output_filepath: str) -> None:
     # compile_flags = ["/Ox", "/std:c++20",'-fopenmp']
     # extra_link_args =['-lgomp','-fopenmp'],
 else:
-    compile_flags = ["-std=c++2a", "-O3", "-fopenmp"]
-    extra_link_args = ['-lgomp','-fopenmp']
+    compile_flags = ["-std=c++2a", "-O3"]
+    extra_link_args = []
+    # compile_flags = ["-std=c++2a", "-O3", "-fopenmp"]
+    # extra_link_args =['-lgomp','-fopenmp'],
 
 this_directory = Path(__file__).parent
 

From 2bfc16c9271e705006108eda15221f3cd9515e38 Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Mon, 9 Jun 2025 19:24:53 +0200
Subject: [PATCH 4/7] Parallelise Union-Find Decoder

---
 cpp_test/TestUnionFind.cpp                    | 12 ++++++
 python_test/test_union_find_parallel.py       | 22 ++++++++++
 src_cpp/union_find.hpp                        | 40 ++++++++++++-------
 .../ldpc/union_find_decoder/__init__.pyi      |  6 +++
 .../_union_find_decoder.pxd                   |  2 +
 .../_union_find_decoder.pyx                   | 12 +++++-
 6 files changed, 78 insertions(+), 16 deletions(-)
 create mode 100644 python_test/test_union_find_parallel.py

diff --git a/cpp_test/TestUnionFind.cpp b/cpp_test/TestUnionFind.cpp
index e6108903..6aa9e4fd 100644
--- a/cpp_test/TestUnionFind.cpp
+++ b/cpp_test/TestUnionFind.cpp
@@ -102,7 +102,19 @@ TEST(UfDecoder, HammingCode2){
         ASSERT_EQ(decoding_syndrome,syndrome);
 
     }
+}
 
+TEST(UfDecoderParallel, parallel_peeling){
+    auto pcm = ldpc::gf2codes::ring_code<ldpc::bp::BpEntry>(30);
+    std::vector<uint8_t> syndrome(pcm.m,0);
+    syndrome[0] = 1;
+    syndrome[1] = 1;
+    auto seq = UfDecoder(pcm);
+    auto par = UfDecoder(pcm);
+    par.set_omp_thread_count(4);
+    auto dec1 = seq.peel_decode(syndrome);
+    auto dec2 = par.peel_decode(syndrome);
+    ASSERT_EQ(dec1, dec2);
 }
 
 TEST(UfDecoder, ring_code3){
diff --git a/python_test/test_union_find_parallel.py b/python_test/test_union_find_parallel.py
new file mode 100644
index 00000000..eb269b0b
--- /dev/null
+++ b/python_test/test_union_find_parallel.py
@@ -0,0 +1,22 @@
+import numpy as np
+import scipy.sparse
+import time
+from ldpc.union_find_decoder import UnionFindDecoder
+
+def test_union_find_parallel_benchmark():
+    hx = scipy.sparse.load_npz('python_test/pcms/hx_surface_20.npz')
+    syndrome = np.zeros(hx.shape[0], dtype=np.uint8)
+    syndrome[0] = 1
+    syndrome[1] = 1
+    dec = UnionFindDecoder(hx, uf_method="")
+    dec.omp_thread_count = 1
+    t0 = time.perf_counter()
+    out1 = dec.decode(syndrome)
+    t1 = time.perf_counter() - t0
+    dec.omp_thread_count = 4
+    t0 = time.perf_counter()
+    out2 = dec.decode(syndrome)
+    t2 = time.perf_counter() - t0
+    assert np.array_equal(out1, out2)
+    # Simple check that multi-threaded decode is not significantly slower
+    assert t2 <= t1 * 2
diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp
index 1129f764..a3370b88 100644
--- a/src_cpp/union_find.hpp
+++ b/src_cpp/union_find.hpp
@@ -418,7 +418,7 @@ namespace ldpc::uf {
             this->check_count = pcm.m;
             this->decoding.resize(this->bit_count);
             this->weighted = false;
-            this->omp_thread_count = omp_threads;
+            this->omp_thread_count = 1;
 
             this->pcm_max_bit_degree_2 = true;
             for (auto i = 0; i < this->pcm.n; i++) {
@@ -433,6 +433,10 @@ namespace ldpc::uf {
             }
         }
 
+        void set_omp_thread_count(int count) {
+            this->omp_thread_count = count;
+        }
+
         std::vector<uint8_t> &
         peel_decode(const std::vector<uint8_t> &syndrome, const std::vector<double> &bit_weights = EMPTY_DOUBLE_VECTOR,
                     int bits_per_step = 1) {
@@ -458,9 +462,9 @@ namespace ldpc::uf {
             }
 
             while (!invalid_clusters.empty()) {
-                #pragma omp parallel for num_threads(this->omp_thread_count) schedule(dynamic)
+                #pragma omp parallel for num_threads(this->omp_thread_count) schedule(static)
                 for (size_t i = 0; i < invalid_clusters.size(); i++) {
-                    Cluster *cl = invalid_clusters[i];
+                    auto cl = invalid_clusters[i];
                     if (cl->active) {
                         #pragma omp critical
                         cl->grow_cluster(bit_weights, bits_per_step);
@@ -468,28 +472,34 @@ namespace ldpc::uf {
                 }
 
                 invalid_clusters.clear();
-                #pragma omp parallel for num_threads(this->omp_thread_count)
-                for (size_t i = 0; i < clusters.size(); i++) {
-                    Cluster *cl = clusters[i];
-                    if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) {
-                        #pragma omp critical
-                        invalid_clusters.push_back(cl);
+                std::vector<Cluster *> tmp_invalid;
+                #pragma omp parallel
+                {
+                    std::vector<Cluster *> local_invalid;
+                    #pragma omp for nowait schedule(static)
+                    for (size_t i = 0; i < clusters.size(); i++) {
+                        auto cl = clusters[i];
+                        if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) {
+                            local_invalid.push_back(cl);
+                        }
                     }
+                    #pragma omp critical
+                    invalid_clusters.insert(invalid_clusters.end(), local_invalid.begin(), local_invalid.end());
                 }
                 std::sort(invalid_clusters.begin(), invalid_clusters.end(), [](const Cluster *lhs, const Cluster *rhs) {
                     return lhs->bit_nodes.size() < rhs->bit_nodes.size();
                 });
             }
-            #pragma omp parallel for num_threads(this->omp_thread_count)
+            #pragma omp parallel for num_threads(this->omp_thread_count) schedule(static)
             for (size_t i = 0; i < clusters.size(); i++) {
-                Cluster *cl = clusters[i];
+                auto cl = clusters[i];
                 if (cl->active) {
                     auto erasure = cl->peel_decode(syndrome);
-                    for (int bit: erasure) {
-                        #pragma omp critical
-                        this->decoding[bit] = 1;
-                    }
+                    #pragma omp critical
+                    for (int bit: erasure) this->decoding[bit] = 1;
                 }
+            }
+            for (auto cl: clusters) {
                 delete cl;
             }
             delete[] global_bit_membership;
diff --git a/src_python/ldpc/union_find_decoder/__init__.pyi b/src_python/ldpc/union_find_decoder/__init__.pyi
index 6d093642..d87fa0b9 100644
--- a/src_python/ldpc/union_find_decoder/__init__.pyi
+++ b/src_python/ldpc/union_find_decoder/__init__.pyi
@@ -49,5 +49,11 @@ class UnionFindDecoder:
             of the parity-check matrix.
         """
 
+    @property
+    def omp_thread_count(self) -> int: ...
+
+    @omp_thread_count.setter
+    def omp_thread_count(self, value: int) -> None: ...
+
     @property
     def decoding(self): ...
diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
index a7f20e38..3cb2609d 100644
--- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
+++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
@@ -13,6 +13,8 @@ cdef extern from "union_find.hpp" namespace "ldpc::uf":
         uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except +
         vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
         vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
+        void set_omp_thread_count(int count)
+        int omp_thread_count
         vector[uint8_t] decoding
 
     cdef const vector[double] EMPTY_DOUBLE_VECTOR "ldpc::uf::EMPTY_DOUBLE_VECTOR"
diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
index bf2bfc21..0fe4724e 100644
--- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
+++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx
@@ -160,9 +160,19 @@ cdef class UnionFindDecoder:
         out = np.zeros(self.n,dtype=DTYPE)
         for i in range(self.n):
             out[i] = self.ufd.decoding[i]
-        
+
         return out
 
+    @property
+    def omp_thread_count(self) -> int:
+        return self.ufd.omp_thread_count
+
+    @omp_thread_count.setter
+    def omp_thread_count(self, value: int) -> None:
+        if not isinstance(value, int) or value < 1:
+            raise TypeError("The omp_thread_count must be specified as a positive integer.")
+        self.ufd.set_omp_thread_count(value)
+
     @property
     def decoding(self):
 

From 843de63d81f12b4eae97eb225d9c69040f90d536 Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Mon, 9 Jun 2025 21:44:05 +0200
Subject: [PATCH 5/7] Fix cypthon compile

---
 src_python/ldpc/union_find_decoder/_union_find_decoder.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
index 3cb2609d..3eeb7718 100644
--- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
+++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd
@@ -10,7 +10,7 @@ ctypedef np.uint8_t uint8_t
 
 cdef extern from "union_find.hpp" namespace "ldpc::uf":
     cdef cppclass uf_decoder_cpp "ldpc::uf::UfDecoder":
-        uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except +
+        uf_decoder_cpp(BpSparse& pcm, int omp_thread_count) except +
         vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
         vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step)
         void set_omp_thread_count(int count)

From 5619c2836830e524f4a6d111d265cec7c9898cf1 Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Tue, 10 Jun 2025 00:06:30 +0200
Subject: [PATCH 6/7] Enchance benchmark

---
 python_test/test_union_find_parallel.py | 56 ++++++++++++++++++-------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/python_test/test_union_find_parallel.py b/python_test/test_union_find_parallel.py
index eb269b0b..6f3462e5 100644
--- a/python_test/test_union_find_parallel.py
+++ b/python_test/test_union_find_parallel.py
@@ -3,20 +3,46 @@
 import time
 from ldpc.union_find_decoder import UnionFindDecoder
 
-def test_union_find_parallel_benchmark():
-    hx = scipy.sparse.load_npz('python_test/pcms/hx_surface_20.npz')
-    syndrome = np.zeros(hx.shape[0], dtype=np.uint8)
-    syndrome[0] = 1
-    syndrome[1] = 1
+
+# Helper function to decode a batch of syndromes and return the
+# average time per decode along with the decode result of the first
+# syndrome (for correctness comparisons).
+def _benchmark_decode(hx, syndromes, thread_count):
     dec = UnionFindDecoder(hx, uf_method="")
-    dec.omp_thread_count = 1
-    t0 = time.perf_counter()
-    out1 = dec.decode(syndrome)
-    t1 = time.perf_counter() - t0
-    dec.omp_thread_count = 4
+    dec.omp_thread_count = thread_count
     t0 = time.perf_counter()
-    out2 = dec.decode(syndrome)
-    t2 = time.perf_counter() - t0
-    assert np.array_equal(out1, out2)
-    # Simple check that multi-threaded decode is not significantly slower
-    assert t2 <= t1 * 2
+    for syn in syndromes:
+        dec.decode(np.ascontiguousarray(syn))
+    avg_time = (time.perf_counter() - t0) / len(syndromes)
+    first_out = dec.decode(np.ascontiguousarray(syndromes[0]))
+    return avg_time, first_out
+
+
+def test_union_find_parallel_benchmark():
+    hx = scipy.sparse.load_npz("python_test/pcms/hx_surface_20.npz").tocsr()
+    rng = np.random.default_rng(0)
+    # Using 128 samples keeps the runtime manageable while still providing
+    # a reasonable estimate of performance. Larger sample sizes caused the
+    # decoder to hang in this environment.
+    num_samples = 128
+    thread_counts = [1, 2, 4, 8]
+    # Higher error rates occasionally caused the decoder to hang during
+    # testing, so we restrict the range here.
+    ps = np.linspace(0.01, 0.05, 3)
+
+    results = {}
+    for p in ps:
+        errors = (rng.random((num_samples, hx.shape[1])) < p).astype(np.uint8)
+        syndromes = (hx.dot(errors.T) % 2).astype(np.uint8).T
+
+        avg_1, ref = _benchmark_decode(hx, syndromes, thread_counts[0])
+        results[(p, thread_counts[0])] = avg_1
+        print(f"p={p:.2f} threads={thread_counts[0]} avg_time={avg_1:.6f}s")
+
+        for t in thread_counts[1:]:
+            avg_t, out = _benchmark_decode(hx, syndromes, t)
+            results[(p, t)] = avg_t
+            print(f"p={p:.2f} threads={t} avg_time={avg_t:.6f}s")
+            assert np.array_equal(out, ref)
+
+        assert results[(p, thread_counts[-1])] <= results[(p, thread_counts[0])] * 2

From 5f86e4adf0a6b2cdb6c92cbd9b75540da48e95c7 Mon Sep 17 00:00:00 2001
From: WingCode <smallstar1234@gmail.com>
Date: Tue, 10 Jun 2025 16:09:23 +0200
Subject: [PATCH 7/7] Try enchance parallelisation

---
 src_cpp/union_find.hpp | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp
index a3370b88..df509b03 100644
--- a/src_cpp/union_find.hpp
+++ b/src_cpp/union_find.hpp
@@ -15,6 +15,9 @@
 #include <robin_set.h>
 #include <numeric>
 #include <mutex>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 #include "gf2sparse_linalg.hpp"
 #include "bp.hpp"
@@ -466,25 +469,29 @@ namespace ldpc::uf {
                 for (size_t i = 0; i < invalid_clusters.size(); i++) {
                     auto cl = invalid_clusters[i];
                     if (cl->active) {
-                        #pragma omp critical
                         cl->grow_cluster(bit_weights, bits_per_step);
                     }
                 }
 
                 invalid_clusters.clear();
-                std::vector<Cluster *> tmp_invalid;
-                #pragma omp parallel
+                std::vector<std::vector<Cluster *>> local_invalid_vec(this->omp_thread_count);
+                #pragma omp parallel num_threads(this->omp_thread_count)
                 {
-                    std::vector<Cluster *> local_invalid;
-                    #pragma omp for nowait schedule(static)
+                    int tid = 0;
+                    #ifdef _OPENMP
+                    tid = omp_get_thread_num();
+                    #endif
+                    auto &local_invalid = local_invalid_vec[tid];
+                    #pragma omp for schedule(static)
                     for (size_t i = 0; i < clusters.size(); i++) {
                         auto cl = clusters[i];
                         if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) {
                             local_invalid.push_back(cl);
                         }
                     }
-                    #pragma omp critical
-                    invalid_clusters.insert(invalid_clusters.end(), local_invalid.begin(), local_invalid.end());
+                }
+                for (auto &vec : local_invalid_vec) {
+                    invalid_clusters.insert(invalid_clusters.end(), vec.begin(), vec.end());
                 }
                 std::sort(invalid_clusters.begin(), invalid_clusters.end(), [](const Cluster *lhs, const Cluster *rhs) {
                     return lhs->bit_nodes.size() < rhs->bit_nodes.size();
@@ -495,8 +502,10 @@ namespace ldpc::uf {
                 auto cl = clusters[i];
                 if (cl->active) {
                     auto erasure = cl->peel_decode(syndrome);
-                    #pragma omp critical
-                    for (int bit: erasure) this->decoding[bit] = 1;
+                    for (int bit: erasure) {
+                        #pragma omp atomic write
+                        this->decoding[bit] = 1;
+                    }
                 }
             }
             for (auto cl: clusters) {