From 9f66822b3c3cc97fed1a18e8a412d6a96ec60d8b Mon Sep 17 00:00:00 2001 From: WingCode Date: Mon, 9 Jun 2025 01:04:13 +0200 Subject: [PATCH 1/7] Parallelise Union-Find Decoder --- cpp_test/TestUnionFind.cpp | 12 +++++++++ src_cpp/union_find.hpp | 27 +++++++++++++++---- .../ldpc/union_find_decoder/__init__.pyi | 2 +- .../_union_find_decoder.pxd | 5 ++-- .../_union_find_decoder.pyx | 5 ++-- 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/cpp_test/TestUnionFind.cpp b/cpp_test/TestUnionFind.cpp index 26aad77a..e6108903 100644 --- a/cpp_test/TestUnionFind.cpp +++ b/cpp_test/TestUnionFind.cpp @@ -209,6 +209,18 @@ TEST(UfDecoder, peeling_with_boundaries_edge_case){ } +TEST(UfDecoder, parallel_peeling){ + auto pcm = ldpc::gf2codes::ring_code(10); + UfDecoder ufd(pcm,4); + std::vector syndrome(pcm.m,0); + syndrome[0]=1; + syndrome[1]=1; + auto decoding = ufd.peel_decode(syndrome); + std::vector expected(pcm.n,0); + expected[1]=1; + ASSERT_EQ(decoding,expected); +} + int main(int argc, char **argv) diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp index d316c5b4..1129f764 100644 --- a/src_cpp/union_find.hpp +++ b/src_cpp/union_find.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include "gf2sparse_linalg.hpp" #include "bp.hpp" @@ -22,6 +23,7 @@ namespace ldpc::uf { const std::vector EMPTY_DOUBLE_VECTOR = {}; tsl::robin_set EMPTY_INT_ROBIN_SET = {}; + inline std::mutex uf_global_mutex; std::vector sort_indices(std::vector &B) { std::vector indices(B.size()); @@ -107,6 +109,7 @@ namespace ldpc::uf { } int add_bit_node_to_cluster(int bit_index) { + std::lock_guard lk(uf_global_mutex); auto bit_membership = this->global_bit_membership[bit_index]; if (bit_membership == this) return 0; //if the bit is already in the cluster terminate. else if (bit_membership == NULL) { @@ -141,6 +144,7 @@ namespace ldpc::uf { } void merge_with_cluster(Cluster *cl2) { + std::lock_guard lk(uf_global_mutex); for (auto bit_index: cl2->bit_nodes) { this->bit_nodes.insert(bit_index); this->global_bit_membership[bit_index] = this; @@ -407,12 +411,14 @@ namespace ldpc::uf { int check_count; tsl::robin_set planar_code_boundary_bits; bool pcm_max_bit_degree_2; + int omp_thread_count; - UfDecoder(ldpc::bp::BpSparse &parity_check_matrix) : pcm(parity_check_matrix) { + UfDecoder(ldpc::bp::BpSparse &parity_check_matrix, int omp_threads = 1) : pcm(parity_check_matrix) { this->bit_count = pcm.n; this->check_count = pcm.m; this->decoding.resize(this->bit_count); this->weighted = false; + this->omp_thread_count = omp_threads; this->pcm_max_bit_degree_2 = true; for (auto i = 0; i < this->pcm.n; i++) { @@ -452,15 +458,21 @@ namespace ldpc::uf { } while (!invalid_clusters.empty()) { - for (auto cl: invalid_clusters) { + #pragma omp parallel for num_threads(this->omp_thread_count) schedule(dynamic) + for (size_t i = 0; i < invalid_clusters.size(); i++) { + Cluster *cl = invalid_clusters[i]; if (cl->active) { + #pragma omp critical cl->grow_cluster(bit_weights, bits_per_step); } } invalid_clusters.clear(); - for (auto cl: clusters) { + #pragma omp parallel for num_threads(this->omp_thread_count) + for (size_t i = 0; i < clusters.size(); i++) { + Cluster *cl = clusters[i]; if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) { + #pragma omp critical invalid_clusters.push_back(cl); } } @@ -468,10 +480,15 @@ namespace ldpc::uf { return lhs->bit_nodes.size() < rhs->bit_nodes.size(); }); } - for (auto cl: clusters) { + #pragma omp parallel for num_threads(this->omp_thread_count) + for (size_t i = 0; i < clusters.size(); i++) { + Cluster *cl = clusters[i]; if (cl->active) { auto erasure = cl->peel_decode(syndrome); - for (int bit: erasure) this->decoding[bit] = 1; + for (int bit: erasure) { + #pragma omp critical + this->decoding[bit] = 1; + } } delete cl; } diff --git a/src_python/ldpc/union_find_decoder/__init__.pyi b/src_python/ldpc/union_find_decoder/__init__.pyi index c341787a..6d093642 100644 --- a/src_python/ldpc/union_find_decoder/__init__.pyi +++ b/src_python/ldpc/union_find_decoder/__init__.pyi @@ -18,7 +18,7 @@ class UnionFindDecoder: Default is False. """ - def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False): ... + def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False, omp_thread_count: int = 1): ... def __del__(self): ... diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd index 98bad540..a7f20e38 100644 --- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd +++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd @@ -10,7 +10,7 @@ ctypedef np.uint8_t uint8_t cdef extern from "union_find.hpp" namespace "ldpc::uf": cdef cppclass uf_decoder_cpp "ldpc::uf::UfDecoder": - uf_decoder_cpp(BpSparse& pcm) except + + uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except + vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) vector[uint8_t] decoding @@ -26,4 +26,5 @@ cdef class UnionFindDecoder(): cdef vector[uint8_t] _syndrome cdef vector[double] uf_llrs cdef bool uf_method - cdef int bits_per_step \ No newline at end of file + cdef int bits_per_step + cdef int omp_thread_count \ No newline at end of file diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx index 8bb97b80..bf2bfc21 100644 --- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx +++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx @@ -60,7 +60,7 @@ cdef class UnionFindDecoder: Default is False. """ - def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False): + def __cinit__(self, pcm: Union[np.ndarray, spmatrix], uf_method: str = False, omp_thread_count: int = 1): self.MEMORY_ALLOCATED=False @@ -75,7 +75,8 @@ cdef class UnionFindDecoder: # get the parity check dimensions self.m, self.n = pcm.shape[0], pcm.shape[1] - self.ufd = new uf_decoder_cpp(self.pcm[0]) + self.ufd = new uf_decoder_cpp(self.pcm[0], omp_thread_count) + self.omp_thread_count = omp_thread_count self._syndrome.resize(self.m) #C vector for the syndrome self.uf_llrs.resize(self.n) #C vector for the log-likehood ratios self.uf_method = uf_method From b8f4d8a754c9a176a94ffd4f6f327df8f902622e Mon Sep 17 00:00:00 2001 From: WingCode Date: Mon, 9 Jun 2025 01:34:45 +0200 Subject: [PATCH 2/7] Parallelise Union-Find Decoder --- CMakeLists.txt | 5 ++++- setup.py | 6 ++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73a4eedd..f608dda1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ project(ldpc LANGUAGES CXX) # Enable OpenMP support for parallel programming -# SET(CMAKE_CXX_FLAGS "-fopenmp") +find_package(OpenMP) # Ensure that the necessary C++11 features are available, as required by the RapidCSV library set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -24,6 +24,9 @@ option(BUILD_LDPC_TESTS "Also build tests for the LDPC project" ${LDPC_MASTER_PR # Define an interface library named 'ldpc' which only includes headers add_library(ldpc INTERFACE) +if(OpenMP_CXX_FOUND) + target_link_libraries(ldpc INTERFACE OpenMP::OpenMP_CXX) +endif() # Add specific directories to the include path for any target that links with the 'ldpc' library target_include_directories(ldpc INTERFACE src_cpp include/robin_map include/rapidcsv) diff --git a/setup.py b/setup.py index 27423240..69ad4ca2 100755 --- a/setup.py +++ b/setup.py @@ -64,10 +64,8 @@ def generate_cython_stub_file(pyx_filepath: str, output_filepath: str) -> None: # compile_flags = ["/Ox", "/std:c++20",'-fopenmp'] # extra_link_args =['-lgomp','-fopenmp'], else: - compile_flags = ["-std=c++2a", "-O3"] - extra_link_args = [] - # compile_flags = ["-std=c++2a", "-O3", "-fopenmp"] - # extra_link_args =['-lgomp','-fopenmp'], + compile_flags = ["-std=c++2a", "-O3", "-fopenmp"] + extra_link_args = ['-lgomp','-fopenmp'] this_directory = Path(__file__).parent From b57f8882132b18f2a53ca1c26f721455e9ab8fa6 Mon Sep 17 00:00:00 2001 From: WingCode Date: Mon, 9 Jun 2025 19:26:02 +0200 Subject: [PATCH 3/7] Revert "Parallelise Union-Find Decoder" This reverts commit b8f4d8a754c9a176a94ffd4f6f327df8f902622e. --- CMakeLists.txt | 5 +---- setup.py | 6 ++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f608dda1..73a4eedd 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ project(ldpc LANGUAGES CXX) # Enable OpenMP support for parallel programming -find_package(OpenMP) +# SET(CMAKE_CXX_FLAGS "-fopenmp") # Ensure that the necessary C++11 features are available, as required by the RapidCSV library set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -24,9 +24,6 @@ option(BUILD_LDPC_TESTS "Also build tests for the LDPC project" ${LDPC_MASTER_PR # Define an interface library named 'ldpc' which only includes headers add_library(ldpc INTERFACE) -if(OpenMP_CXX_FOUND) - target_link_libraries(ldpc INTERFACE OpenMP::OpenMP_CXX) -endif() # Add specific directories to the include path for any target that links with the 'ldpc' library target_include_directories(ldpc INTERFACE src_cpp include/robin_map include/rapidcsv) diff --git a/setup.py b/setup.py index 69ad4ca2..27423240 100755 --- a/setup.py +++ b/setup.py @@ -64,8 +64,10 @@ def generate_cython_stub_file(pyx_filepath: str, output_filepath: str) -> None: # compile_flags = ["/Ox", "/std:c++20",'-fopenmp'] # extra_link_args =['-lgomp','-fopenmp'], else: - compile_flags = ["-std=c++2a", "-O3", "-fopenmp"] - extra_link_args = ['-lgomp','-fopenmp'] + compile_flags = ["-std=c++2a", "-O3"] + extra_link_args = [] + # compile_flags = ["-std=c++2a", "-O3", "-fopenmp"] + # extra_link_args =['-lgomp','-fopenmp'], this_directory = Path(__file__).parent From 2bfc16c9271e705006108eda15221f3cd9515e38 Mon Sep 17 00:00:00 2001 From: WingCode Date: Mon, 9 Jun 2025 19:24:53 +0200 Subject: [PATCH 4/7] Parallelise Union-Find Decoder --- cpp_test/TestUnionFind.cpp | 12 ++++++ python_test/test_union_find_parallel.py | 22 ++++++++++ src_cpp/union_find.hpp | 40 ++++++++++++------- .../ldpc/union_find_decoder/__init__.pyi | 6 +++ .../_union_find_decoder.pxd | 2 + .../_union_find_decoder.pyx | 12 +++++- 6 files changed, 78 insertions(+), 16 deletions(-) create mode 100644 python_test/test_union_find_parallel.py diff --git a/cpp_test/TestUnionFind.cpp b/cpp_test/TestUnionFind.cpp index e6108903..6aa9e4fd 100644 --- a/cpp_test/TestUnionFind.cpp +++ b/cpp_test/TestUnionFind.cpp @@ -102,7 +102,19 @@ TEST(UfDecoder, HammingCode2){ ASSERT_EQ(decoding_syndrome,syndrome); } +} +TEST(UfDecoderParallel, parallel_peeling){ + auto pcm = ldpc::gf2codes::ring_code(30); + std::vector syndrome(pcm.m,0); + syndrome[0] = 1; + syndrome[1] = 1; + auto seq = UfDecoder(pcm); + auto par = UfDecoder(pcm); + par.set_omp_thread_count(4); + auto dec1 = seq.peel_decode(syndrome); + auto dec2 = par.peel_decode(syndrome); + ASSERT_EQ(dec1, dec2); } TEST(UfDecoder, ring_code3){ diff --git a/python_test/test_union_find_parallel.py b/python_test/test_union_find_parallel.py new file mode 100644 index 00000000..eb269b0b --- /dev/null +++ b/python_test/test_union_find_parallel.py @@ -0,0 +1,22 @@ +import numpy as np +import scipy.sparse +import time +from ldpc.union_find_decoder import UnionFindDecoder + +def test_union_find_parallel_benchmark(): + hx = scipy.sparse.load_npz('python_test/pcms/hx_surface_20.npz') + syndrome = np.zeros(hx.shape[0], dtype=np.uint8) + syndrome[0] = 1 + syndrome[1] = 1 + dec = UnionFindDecoder(hx, uf_method="") + dec.omp_thread_count = 1 + t0 = time.perf_counter() + out1 = dec.decode(syndrome) + t1 = time.perf_counter() - t0 + dec.omp_thread_count = 4 + t0 = time.perf_counter() + out2 = dec.decode(syndrome) + t2 = time.perf_counter() - t0 + assert np.array_equal(out1, out2) + # Simple check that multi-threaded decode is not significantly slower + assert t2 <= t1 * 2 diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp index 1129f764..a3370b88 100644 --- a/src_cpp/union_find.hpp +++ b/src_cpp/union_find.hpp @@ -418,7 +418,7 @@ namespace ldpc::uf { this->check_count = pcm.m; this->decoding.resize(this->bit_count); this->weighted = false; - this->omp_thread_count = omp_threads; + this->omp_thread_count = 1; this->pcm_max_bit_degree_2 = true; for (auto i = 0; i < this->pcm.n; i++) { @@ -433,6 +433,10 @@ namespace ldpc::uf { } } + void set_omp_thread_count(int count) { + this->omp_thread_count = count; + } + std::vector & peel_decode(const std::vector &syndrome, const std::vector &bit_weights = EMPTY_DOUBLE_VECTOR, int bits_per_step = 1) { @@ -458,9 +462,9 @@ namespace ldpc::uf { } while (!invalid_clusters.empty()) { - #pragma omp parallel for num_threads(this->omp_thread_count) schedule(dynamic) + #pragma omp parallel for num_threads(this->omp_thread_count) schedule(static) for (size_t i = 0; i < invalid_clusters.size(); i++) { - Cluster *cl = invalid_clusters[i]; + auto cl = invalid_clusters[i]; if (cl->active) { #pragma omp critical cl->grow_cluster(bit_weights, bits_per_step); @@ -468,28 +472,34 @@ namespace ldpc::uf { } invalid_clusters.clear(); - #pragma omp parallel for num_threads(this->omp_thread_count) - for (size_t i = 0; i < clusters.size(); i++) { - Cluster *cl = clusters[i]; - if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) { - #pragma omp critical - invalid_clusters.push_back(cl); + std::vector tmp_invalid; + #pragma omp parallel + { + std::vector local_invalid; + #pragma omp for nowait schedule(static) + for (size_t i = 0; i < clusters.size(); i++) { + auto cl = clusters[i]; + if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) { + local_invalid.push_back(cl); + } } + #pragma omp critical + invalid_clusters.insert(invalid_clusters.end(), local_invalid.begin(), local_invalid.end()); } std::sort(invalid_clusters.begin(), invalid_clusters.end(), [](const Cluster *lhs, const Cluster *rhs) { return lhs->bit_nodes.size() < rhs->bit_nodes.size(); }); } - #pragma omp parallel for num_threads(this->omp_thread_count) + #pragma omp parallel for num_threads(this->omp_thread_count) schedule(static) for (size_t i = 0; i < clusters.size(); i++) { - Cluster *cl = clusters[i]; + auto cl = clusters[i]; if (cl->active) { auto erasure = cl->peel_decode(syndrome); - for (int bit: erasure) { - #pragma omp critical - this->decoding[bit] = 1; - } + #pragma omp critical + for (int bit: erasure) this->decoding[bit] = 1; } + } + for (auto cl: clusters) { delete cl; } delete[] global_bit_membership; diff --git a/src_python/ldpc/union_find_decoder/__init__.pyi b/src_python/ldpc/union_find_decoder/__init__.pyi index 6d093642..d87fa0b9 100644 --- a/src_python/ldpc/union_find_decoder/__init__.pyi +++ b/src_python/ldpc/union_find_decoder/__init__.pyi @@ -49,5 +49,11 @@ class UnionFindDecoder: of the parity-check matrix. """ + @property + def omp_thread_count(self) -> int: ... + + @omp_thread_count.setter + def omp_thread_count(self, value: int) -> None: ... + @property def decoding(self): ... diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd index a7f20e38..3cb2609d 100644 --- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd +++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd @@ -13,6 +13,8 @@ cdef extern from "union_find.hpp" namespace "ldpc::uf": uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except + vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) + void set_omp_thread_count(int count) + int omp_thread_count vector[uint8_t] decoding cdef const vector[double] EMPTY_DOUBLE_VECTOR "ldpc::uf::EMPTY_DOUBLE_VECTOR" diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx index bf2bfc21..0fe4724e 100644 --- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx +++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pyx @@ -160,9 +160,19 @@ cdef class UnionFindDecoder: out = np.zeros(self.n,dtype=DTYPE) for i in range(self.n): out[i] = self.ufd.decoding[i] - + return out + @property + def omp_thread_count(self) -> int: + return self.ufd.omp_thread_count + + @omp_thread_count.setter + def omp_thread_count(self, value: int) -> None: + if not isinstance(value, int) or value < 1: + raise TypeError("The omp_thread_count must be specified as a positive integer.") + self.ufd.set_omp_thread_count(value) + @property def decoding(self): From 843de63d81f12b4eae97eb225d9c69040f90d536 Mon Sep 17 00:00:00 2001 From: WingCode Date: Mon, 9 Jun 2025 21:44:05 +0200 Subject: [PATCH 5/7] Fix cypthon compile --- src_python/ldpc/union_find_decoder/_union_find_decoder.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd index 3cb2609d..3eeb7718 100644 --- a/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd +++ b/src_python/ldpc/union_find_decoder/_union_find_decoder.pxd @@ -10,7 +10,7 @@ ctypedef np.uint8_t uint8_t cdef extern from "union_find.hpp" namespace "ldpc::uf": cdef cppclass uf_decoder_cpp "ldpc::uf::UfDecoder": - uf_decoder_cpp(BpSparse& pcm, int omp_thread_count=1) except + + uf_decoder_cpp(BpSparse& pcm, int omp_thread_count) except + vector[uint8_t]& peel_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) vector[uint8_t]& matrix_decode(vector[uint8_t]& syndrome, const vector[double]& bit_weights, int bits_per_step) void set_omp_thread_count(int count) From 5619c2836830e524f4a6d111d265cec7c9898cf1 Mon Sep 17 00:00:00 2001 From: WingCode Date: Tue, 10 Jun 2025 00:06:30 +0200 Subject: [PATCH 6/7] Enchance benchmark --- python_test/test_union_find_parallel.py | 56 ++++++++++++++++++------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/python_test/test_union_find_parallel.py b/python_test/test_union_find_parallel.py index eb269b0b..6f3462e5 100644 --- a/python_test/test_union_find_parallel.py +++ b/python_test/test_union_find_parallel.py @@ -3,20 +3,46 @@ import time from ldpc.union_find_decoder import UnionFindDecoder -def test_union_find_parallel_benchmark(): - hx = scipy.sparse.load_npz('python_test/pcms/hx_surface_20.npz') - syndrome = np.zeros(hx.shape[0], dtype=np.uint8) - syndrome[0] = 1 - syndrome[1] = 1 + +# Helper function to decode a batch of syndromes and return the +# average time per decode along with the decode result of the first +# syndrome (for correctness comparisons). +def _benchmark_decode(hx, syndromes, thread_count): dec = UnionFindDecoder(hx, uf_method="") - dec.omp_thread_count = 1 - t0 = time.perf_counter() - out1 = dec.decode(syndrome) - t1 = time.perf_counter() - t0 - dec.omp_thread_count = 4 + dec.omp_thread_count = thread_count t0 = time.perf_counter() - out2 = dec.decode(syndrome) - t2 = time.perf_counter() - t0 - assert np.array_equal(out1, out2) - # Simple check that multi-threaded decode is not significantly slower - assert t2 <= t1 * 2 + for syn in syndromes: + dec.decode(np.ascontiguousarray(syn)) + avg_time = (time.perf_counter() - t0) / len(syndromes) + first_out = dec.decode(np.ascontiguousarray(syndromes[0])) + return avg_time, first_out + + +def test_union_find_parallel_benchmark(): + hx = scipy.sparse.load_npz("python_test/pcms/hx_surface_20.npz").tocsr() + rng = np.random.default_rng(0) + # Using 128 samples keeps the runtime manageable while still providing + # a reasonable estimate of performance. Larger sample sizes caused the + # decoder to hang in this environment. + num_samples = 128 + thread_counts = [1, 2, 4, 8] + # Higher error rates occasionally caused the decoder to hang during + # testing, so we restrict the range here. + ps = np.linspace(0.01, 0.05, 3) + + results = {} + for p in ps: + errors = (rng.random((num_samples, hx.shape[1])) < p).astype(np.uint8) + syndromes = (hx.dot(errors.T) % 2).astype(np.uint8).T + + avg_1, ref = _benchmark_decode(hx, syndromes, thread_counts[0]) + results[(p, thread_counts[0])] = avg_1 + print(f"p={p:.2f} threads={thread_counts[0]} avg_time={avg_1:.6f}s") + + for t in thread_counts[1:]: + avg_t, out = _benchmark_decode(hx, syndromes, t) + results[(p, t)] = avg_t + print(f"p={p:.2f} threads={t} avg_time={avg_t:.6f}s") + assert np.array_equal(out, ref) + + assert results[(p, thread_counts[-1])] <= results[(p, thread_counts[0])] * 2 From 5f86e4adf0a6b2cdb6c92cbd9b75540da48e95c7 Mon Sep 17 00:00:00 2001 From: WingCode Date: Tue, 10 Jun 2025 16:09:23 +0200 Subject: [PATCH 7/7] Try enchance parallelisation --- src_cpp/union_find.hpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src_cpp/union_find.hpp b/src_cpp/union_find.hpp index a3370b88..df509b03 100644 --- a/src_cpp/union_find.hpp +++ b/src_cpp/union_find.hpp @@ -15,6 +15,9 @@ #include #include #include +#ifdef _OPENMP +#include +#endif #include "gf2sparse_linalg.hpp" #include "bp.hpp" @@ -466,25 +469,29 @@ namespace ldpc::uf { for (size_t i = 0; i < invalid_clusters.size(); i++) { auto cl = invalid_clusters[i]; if (cl->active) { - #pragma omp critical cl->grow_cluster(bit_weights, bits_per_step); } } invalid_clusters.clear(); - std::vector tmp_invalid; - #pragma omp parallel + std::vector> local_invalid_vec(this->omp_thread_count); + #pragma omp parallel num_threads(this->omp_thread_count) { - std::vector local_invalid; - #pragma omp for nowait schedule(static) + int tid = 0; + #ifdef _OPENMP + tid = omp_get_thread_num(); + #endif + auto &local_invalid = local_invalid_vec[tid]; + #pragma omp for schedule(static) for (size_t i = 0; i < clusters.size(); i++) { auto cl = clusters[i]; if (cl->active && cl->parity() == 1 && !cl->contains_boundary_bits) { local_invalid.push_back(cl); } } - #pragma omp critical - invalid_clusters.insert(invalid_clusters.end(), local_invalid.begin(), local_invalid.end()); + } + for (auto &vec : local_invalid_vec) { + invalid_clusters.insert(invalid_clusters.end(), vec.begin(), vec.end()); } std::sort(invalid_clusters.begin(), invalid_clusters.end(), [](const Cluster *lhs, const Cluster *rhs) { return lhs->bit_nodes.size() < rhs->bit_nodes.size(); @@ -495,8 +502,10 @@ namespace ldpc::uf { auto cl = clusters[i]; if (cl->active) { auto erasure = cl->peel_decode(syndrome); - #pragma omp critical - for (int bit: erasure) this->decoding[bit] = 1; + for (int bit: erasure) { + #pragma omp atomic write + this->decoding[bit] = 1; + } } } for (auto cl: clusters) {