From e52b66d3bf97daef808493ed53f1fb1d2af35573 Mon Sep 17 00:00:00 2001 From: Muhammad Alhroob Date: Mon, 10 Nov 2025 11:02:37 +0100 Subject: [PATCH 1/3] Rewrite the matrix multplication function. rearrange the loops and use blocking to enhance the speed (a factor of 6 for very large matrices) --- math/matrix/src/TMatrixT.cxx | 58 ++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx index ad07f1ed5c1a5..12613fe84cf19 100644 --- a/math/matrix/src/TMatrixT.cxx +++ b/math/matrix/src/TMatrixT.cxx @@ -3076,23 +3076,55 @@ TMatrixT &TMatrixTAutoloadOps::ElementDiv(TMatrixT &target, co //////////////////////////////////////////////////////////////////////////////// /// Elementary routine to calculate matrix multiplication A*B + template void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb, - Int_t ncolsb, Element *cp) -{ - const Element *arp0 = ap; // Pointer to A[i,0]; - while (arp0 < ap + na) { - for (const Element *bcp = bp; bcp < bp + ncolsb;) { // Pointer to the j-th column of B, Start bcp = B[0,0] - const Element *arp = arp0; // Pointer to the i-th row of A, reset to A[i,0] - Element cij = 0; - while (bcp < bp + nb) { // Scan the i-th row of A and - cij += *arp++ * *bcp; // the j-th col of B - bcp += ncolsb; + Int_t ncolsb, Element *cp) { + // i,k,j loop order with blocking and unrolling + const Int_t M = na / ncolsa; // Rows of A + const Int_t N = ncolsa; // Columns of A, rows of B + const Int_t P = ncolsb; // Columns of B and C + + const Int_t BLOCK = 32; + + #ifdef _OPENMP + #pragma omp parallel for collapse(2) if(M * P > 10000) + #endif + for (Int_t i0 = 0; i0 < M; i0 += BLOCK) { + for (Int_t k0 = 0; k0 < N; k0 += BLOCK) { + for (Int_t j0 = 0; j0 < P; j0 += BLOCK) { + const Int_t iMax = (i0 + BLOCK < M) ? i0 + BLOCK : M; + const Int_t kMax = (k0 + BLOCK < N) ? k0 + BLOCK : N; + const Int_t jMax = (j0 + BLOCK < P) ? j0 + BLOCK : P; + for (Int_t i = i0; i < iMax; ++i) { + for (Int_t k = k0; k < kMax; ++k) { + Element aik = ap[i * N + k]; // Hoist A[i,k] + Int_t j = j0; + #pragma GCC ivdep + for (; j <= jMax - 4; j += 4) { + // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3] + Element cij0 = cp[i * P + j]; + Element cij1 = cp[i * P + (j + 1)]; + Element cij2 = cp[i * P + (j + 2)]; + Element cij3 = cp[i * P + (j + 3)]; + cij0 += aik * bp[k * P + j]; + cij1 += aik * bp[k * P + (j + 1)]; + cij2 += aik * bp[k * P + (j + 2)]; + cij3 += aik * bp[k * P + (j + 3)]; + cp[i * P + j] = cij0; + cp[i * P + (j + 1)] = cij1; + cp[i * P + (j + 2)] = cij2; + cp[i * P + (j + 3)] = cij3; + } + #pragma GCC ivdep + for (; j < jMax; ++j) { + // Cleanup loop for remaining j + cp[i * P + j] += aik * bp[k * P + j]; + } + } + } } - *cp++ = cij; - bcp -= nb - 1; // Set bcp to the (j+1)-th col } - arp0 += ncolsa; // Set ap to the (i+1)-th row } } From 09e80d003c8327a917eabdf53127e2a97a3ed12d Mon Sep 17 00:00:00 2001 From: Muhammad Alhroob Date: Tue, 25 Nov 2025 15:32:25 +0100 Subject: [PATCH 2/3] clang-format AMultB function --- math/matrix/src/TMatrixT.cxx | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx index 12613fe84cf19..140bb95a48b1b 100644 --- a/math/matrix/src/TMatrixT.cxx +++ b/math/matrix/src/TMatrixT.cxx @@ -3076,20 +3076,20 @@ TMatrixT &TMatrixTAutoloadOps::ElementDiv(TMatrixT &target, co //////////////////////////////////////////////////////////////////////////////// /// Elementary routine to calculate matrix multiplication A*B - template void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb, - Int_t ncolsb, Element *cp) { + Int_t ncolsb, Element *cp) +{ // i,k,j loop order with blocking and unrolling const Int_t M = na / ncolsa; // Rows of A - const Int_t N = ncolsa; // Columns of A, rows of B - const Int_t P = ncolsb; // Columns of B and C + const Int_t N = ncolsa; // Columns of A, rows of B + const Int_t P = ncolsb; // Columns of B and C const Int_t BLOCK = 32; - #ifdef _OPENMP - #pragma omp parallel for collapse(2) if(M * P > 10000) - #endif +#ifdef _OPENMP +#pragma omp parallel for collapse(2) if (M * P > 10000) +#endif for (Int_t i0 = 0; i0 < M; i0 += BLOCK) { for (Int_t k0 = 0; k0 < N; k0 += BLOCK) { for (Int_t j0 = 0; j0 < P; j0 += BLOCK) { @@ -3100,7 +3100,7 @@ void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa for (Int_t k = k0; k < kMax; ++k) { Element aik = ap[i * N + k]; // Hoist A[i,k] Int_t j = j0; - #pragma GCC ivdep +#pragma GCC ivdep for (; j <= jMax - 4; j += 4) { // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3] Element cij0 = cp[i * P + j]; @@ -3116,7 +3116,7 @@ void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa cp[i * P + (j + 2)] = cij2; cp[i * P + (j + 3)] = cij3; } - #pragma GCC ivdep +#pragma GCC ivdep for (; j < jMax; ++j) { // Cleanup loop for remaining j cp[i * P + j] += aik * bp[k * P + j]; From 987e29b68ca44a5a516b20e231c5b5683d8a311c Mon Sep 17 00:00:00 2001 From: Muhammad Alhroob Date: Thu, 27 Nov 2025 18:06:37 +0100 Subject: [PATCH 3/3] Bug Fix, more optimisation --- math/matrix/src/TMatrixT.cxx | 82 +++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx index 140bb95a48b1b..1916908300b8f 100644 --- a/math/matrix/src/TMatrixT.cxx +++ b/math/matrix/src/TMatrixT.cxx @@ -3080,47 +3080,61 @@ template void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb, Int_t ncolsb, Element *cp) { - // i,k,j loop order with blocking and unrolling - const Int_t M = na / ncolsa; // Rows of A - const Int_t N = ncolsa; // Columns of A, rows of B - const Int_t P = ncolsb; // Columns of B and C - - const Int_t BLOCK = 32; - + const Int_t M = na / ncolsa; + const Int_t N = ncolsa; + const Int_t P = ncolsb; + + if (M <= 12 && N <= 12 && P <= 12) { + for (Int_t i = 0; i < M; ++i) { + for (Int_t j = 0; j < P; ++j) { + Element sum = Element(0); + for (Int_t k = 0; k < N; ++k) { + sum += ap[i * N + k] * bp[k * P + j]; + } + cp[i * P + j] = sum; + } + } + return; + } + const Int_t BLOCK = (M >= 192 && N >= 192 && P >= 192) ? 48 : 32; #ifdef _OPENMP #pragma omp parallel for collapse(2) if (M * P > 10000) #endif for (Int_t i0 = 0; i0 < M; i0 += BLOCK) { - for (Int_t k0 = 0; k0 < N; k0 += BLOCK) { - for (Int_t j0 = 0; j0 < P; j0 += BLOCK) { - const Int_t iMax = (i0 + BLOCK < M) ? i0 + BLOCK : M; - const Int_t kMax = (k0 + BLOCK < N) ? k0 + BLOCK : N; - const Int_t jMax = (j0 + BLOCK < P) ? j0 + BLOCK : P; - for (Int_t i = i0; i < iMax; ++i) { - for (Int_t k = k0; k < kMax; ++k) { - Element aik = ap[i * N + k]; // Hoist A[i,k] + for (Int_t j0 = 0; j0 < P; j0 += BLOCK) { + const Int_t i1 = (i0 + BLOCK < M) ? i0 + BLOCK : M; + const Int_t j1 = (j0 + BLOCK < P) ? j0 + BLOCK : P; + for (Int_t i = i0; i < i1; ++i) { + Int_t j = j0; + for (; j <= j1 - 4; j += 4) { + cp[i * P + j + 0] = Element(0); + cp[i * P + j + 1] = Element(0); + cp[i * P + j + 2] = Element(0); + cp[i * P + j + 3] = Element(0); + } + for (; j < j1; ++j) + cp[i * P + j] = Element(0); + } + + // ────────────────────── Accumulate over k blocks ────────────────────── + for (Int_t k0 = 0; k0 < N; k0 += BLOCK) { + const Int_t k1 = (k0 + BLOCK < N) ? k0 + BLOCK : N; + + for (Int_t i = i0; i < i1; ++i) { + for (Int_t k = k0; k < k1; ++k) { + const Element aik = ap[i * N + k]; + Int_t j = j0; -#pragma GCC ivdep - for (; j <= jMax - 4; j += 4) { - // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3] - Element cij0 = cp[i * P + j]; - Element cij1 = cp[i * P + (j + 1)]; - Element cij2 = cp[i * P + (j + 2)]; - Element cij3 = cp[i * P + (j + 3)]; - cij0 += aik * bp[k * P + j]; - cij1 += aik * bp[k * P + (j + 1)]; - cij2 += aik * bp[k * P + (j + 2)]; - cij3 += aik * bp[k * P + (j + 3)]; - cp[i * P + j] = cij0; - cp[i * P + (j + 1)] = cij1; - cp[i * P + (j + 2)] = cij2; - cp[i * P + (j + 3)] = cij3; + // Main 4-wide accumulation + for (; j <= j1 - 4; j += 4) { + cp[i * P + j + 0] += aik * bp[k * P + j + 0]; + cp[i * P + j + 1] += aik * bp[k * P + j + 1]; + cp[i * P + j + 2] += aik * bp[k * P + j + 2]; + cp[i * P + j + 3] += aik * bp[k * P + j + 3]; } -#pragma GCC ivdep - for (; j < jMax; ++j) { - // Cleanup loop for remaining j + // Remainder + for (; j < j1; ++j) cp[i * P + j] += aik * bp[k * P + j]; - } } } }