From e52b66d3bf97daef808493ed53f1fb1d2af35573 Mon Sep 17 00:00:00 2001
From: Muhammad Alhroob <muhammad.alhroob@cern.ch>
Date: Mon, 10 Nov 2025 11:02:37 +0100
Subject: [PATCH 1/3] Rewrite the matrix multplication function. rearrange the
 loops and use blocking to enhance the speed (a factor of 6 for very large
 matrices)

---
 math/matrix/src/TMatrixT.cxx | 58 ++++++++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 13 deletions(-)
diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx
index ad07f1ed5c1a5..12613fe84cf19 100644
--- a/math/matrix/src/TMatrixT.cxx
+++ b/math/matrix/src/TMatrixT.cxx
@@ -3076,23 +3076,55 @@ TMatrixT<Element> &TMatrixTAutoloadOps::ElementDiv(TMatrixT<Element> &target, co
 ////////////////////////////////////////////////////////////////////////////////
 /// Elementary routine to calculate matrix multiplication A*B
 
+
 template <class Element>
 void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb,
-                                 Int_t ncolsb, Element *cp)
-{
-   const Element *arp0 = ap; // Pointer to  A[i,0];
-   while (arp0 < ap + na) {
-      for (const Element *bcp = bp; bcp < bp + ncolsb;) { // Pointer to the j-th column of B, Start bcp = B[0,0]
-         const Element *arp = arp0;                       // Pointer to the i-th row of A, reset to A[i,0]
-         Element cij = 0;
-         while (bcp < bp + nb) {  // Scan the i-th row of A and
-            cij += *arp++ * *bcp; // the j-th col of B
-            bcp += ncolsb;
+                                 Int_t ncolsb, Element *cp) {
+   // i,k,j loop order with blocking and unrolling
+   const Int_t M = na / ncolsa; // Rows of A
+   const Int_t N = ncolsa;	// Columns of A, rows of B
+   const Int_t P = ncolsb;	// Columns of B and C
+
+   const Int_t BLOCK = 32;
+
+   #ifdef _OPENMP
+   #pragma omp parallel for collapse(2) if(M * P > 10000)
+   #endif
+   for (Int_t i0 = 0; i0 < M; i0 += BLOCK) {
+      for (Int_t k0 = 0; k0 < N; k0 += BLOCK) {
+         for (Int_t j0 = 0; j0 < P; j0 += BLOCK) {
+            const Int_t iMax = (i0 + BLOCK < M) ? i0 + BLOCK : M;
+            const Int_t kMax = (k0 + BLOCK < N) ? k0 + BLOCK : N;
+            const Int_t jMax = (j0 + BLOCK < P) ? j0 + BLOCK : P;
+            for (Int_t i = i0; i < iMax; ++i) {
+               for (Int_t k = k0; k < kMax; ++k) {
+                  Element aik = ap[i * N + k]; // Hoist A[i,k]
+                  Int_t j = j0;
+                  #pragma GCC ivdep
+                  for (; j <= jMax - 4; j += 4) {
+                     // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3]
+                     Element cij0 = cp[i * P + j];
+                     Element cij1 = cp[i * P + (j + 1)];
+                     Element cij2 = cp[i * P + (j + 2)];
+                     Element cij3 = cp[i * P + (j + 3)];
+                     cij0 += aik * bp[k * P + j];
+                     cij1 += aik * bp[k * P + (j + 1)];
+                     cij2 += aik * bp[k * P + (j + 2)];
+                     cij3 += aik * bp[k * P + (j + 3)];
+                     cp[i * P + j] = cij0;
+                     cp[i * P + (j + 1)] = cij1;
+                     cp[i * P + (j + 2)] = cij2;
+                     cp[i * P + (j + 3)] = cij3;
+                  }
+                  #pragma GCC ivdep
+                  for (; j < jMax; ++j) {
+                     // Cleanup loop for remaining j
+                     cp[i * P + j] += aik * bp[k * P + j];
+                  }
+               }
+            }
          }
-         *cp++ = cij;
-         bcp -= nb - 1; // Set bcp to the (j+1)-th col
       }
-      arp0 += ncolsa; // Set ap to the (i+1)-th row
    }
 }
 

From 09e80d003c8327a917eabdf53127e2a97a3ed12d Mon Sep 17 00:00:00 2001
From: Muhammad Alhroob <muhammad.alhroob@cern.ch>
Date: Tue, 25 Nov 2025 15:32:25 +0100
Subject: [PATCH 2/3] clang-format AMultB function

---
 math/matrix/src/TMatrixT.cxx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx
index 12613fe84cf19..140bb95a48b1b 100644
--- a/math/matrix/src/TMatrixT.cxx
+++ b/math/matrix/src/TMatrixT.cxx
@@ -3076,20 +3076,20 @@ TMatrixT<Element> &TMatrixTAutoloadOps::ElementDiv(TMatrixT<Element> &target, co
 ////////////////////////////////////////////////////////////////////////////////
 /// Elementary routine to calculate matrix multiplication A*B
 
-
 template <class Element>
 void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb,
-                                 Int_t ncolsb, Element *cp) {
+                                 Int_t ncolsb, Element *cp)
+{
    // i,k,j loop order with blocking and unrolling
    const Int_t M = na / ncolsa; // Rows of A
-   const Int_t N = ncolsa;	// Columns of A, rows of B
-   const Int_t P = ncolsb;	// Columns of B and C
+   const Int_t N = ncolsa;      // Columns of A, rows of B
+   const Int_t P = ncolsb;      // Columns of B and C
 
    const Int_t BLOCK = 32;
 
-   #ifdef _OPENMP
-   #pragma omp parallel for collapse(2) if(M * P > 10000)
-   #endif
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (M * P > 10000)
+#endif
    for (Int_t i0 = 0; i0 < M; i0 += BLOCK) {
       for (Int_t k0 = 0; k0 < N; k0 += BLOCK) {
          for (Int_t j0 = 0; j0 < P; j0 += BLOCK) {
@@ -3100,7 +3100,7 @@ void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa
                for (Int_t k = k0; k < kMax; ++k) {
                   Element aik = ap[i * N + k]; // Hoist A[i,k]
                   Int_t j = j0;
-                  #pragma GCC ivdep
+#pragma GCC ivdep
                   for (; j <= jMax - 4; j += 4) {
                      // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3]
                      Element cij0 = cp[i * P + j];
@@ -3116,7 +3116,7 @@ void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa
                      cp[i * P + (j + 2)] = cij2;
                      cp[i * P + (j + 3)] = cij3;
                   }
-                  #pragma GCC ivdep
+#pragma GCC ivdep
                   for (; j < jMax; ++j) {
                      // Cleanup loop for remaining j
                      cp[i * P + j] += aik * bp[k * P + j];

From 987e29b68ca44a5a516b20e231c5b5683d8a311c Mon Sep 17 00:00:00 2001
From: Muhammad Alhroob <muhammad.alhroob@cern.ch>
Date: Thu, 27 Nov 2025 18:06:37 +0100
Subject: [PATCH 3/3] Bug Fix, more optimisation

---
 math/matrix/src/TMatrixT.cxx | 82 +++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 34 deletions(-)

diff --git a/math/matrix/src/TMatrixT.cxx b/math/matrix/src/TMatrixT.cxx
index 140bb95a48b1b..1916908300b8f 100644
--- a/math/matrix/src/TMatrixT.cxx
+++ b/math/matrix/src/TMatrixT.cxx
@@ -3080,47 +3080,61 @@ template <class Element>
 void TMatrixTAutoloadOps::AMultB(const Element *const ap, Int_t na, Int_t ncolsa, const Element *const bp, Int_t nb,
                                  Int_t ncolsb, Element *cp)
 {
-   // i,k,j loop order with blocking and unrolling
-   const Int_t M = na / ncolsa; // Rows of A
-   const Int_t N = ncolsa;      // Columns of A, rows of B
-   const Int_t P = ncolsb;      // Columns of B and C
-
-   const Int_t BLOCK = 32;
-
+   const Int_t M = na / ncolsa;
+   const Int_t N = ncolsa;
+   const Int_t P = ncolsb;
+
+   if (M <= 12 && N <= 12 && P <= 12) {
+      for (Int_t i = 0; i < M; ++i) {
+         for (Int_t j = 0; j < P; ++j) {
+            Element sum = Element(0);
+            for (Int_t k = 0; k < N; ++k) {
+               sum += ap[i * N + k] * bp[k * P + j];
+            }
+            cp[i * P + j] = sum;
+         }
+      }
+      return;
+   }
+   const Int_t BLOCK = (M >= 192 && N >= 192 && P >= 192) ? 48 : 32;
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) if (M * P > 10000)
 #endif
    for (Int_t i0 = 0; i0 < M; i0 += BLOCK) {
-      for (Int_t k0 = 0; k0 < N; k0 += BLOCK) {
-         for (Int_t j0 = 0; j0 < P; j0 += BLOCK) {
-            const Int_t iMax = (i0 + BLOCK < M) ? i0 + BLOCK : M;
-            const Int_t kMax = (k0 + BLOCK < N) ? k0 + BLOCK : N;
-            const Int_t jMax = (j0 + BLOCK < P) ? j0 + BLOCK : P;
-            for (Int_t i = i0; i < iMax; ++i) {
-               for (Int_t k = k0; k < kMax; ++k) {
-                  Element aik = ap[i * N + k]; // Hoist A[i,k]
+      for (Int_t j0 = 0; j0 < P; j0 += BLOCK) {
+         const Int_t i1 = (i0 + BLOCK < M) ? i0 + BLOCK : M;
+         const Int_t j1 = (j0 + BLOCK < P) ? j0 + BLOCK : P;
+         for (Int_t i = i0; i < i1; ++i) {
+            Int_t j = j0;
+            for (; j <= j1 - 4; j += 4) {
+               cp[i * P + j + 0] = Element(0);
+               cp[i * P + j + 1] = Element(0);
+               cp[i * P + j + 2] = Element(0);
+               cp[i * P + j + 3] = Element(0);
+            }
+            for (; j < j1; ++j)
+               cp[i * P + j] = Element(0);
+         }
+
+         // ────────────────────── Accumulate over k blocks ──────────────────────
+         for (Int_t k0 = 0; k0 < N; k0 += BLOCK) {
+            const Int_t k1 = (k0 + BLOCK < N) ? k0 + BLOCK : N;
+
+            for (Int_t i = i0; i < i1; ++i) {
+               for (Int_t k = k0; k < k1; ++k) {
+                  const Element aik = ap[i * N + k];
+
                   Int_t j = j0;
-#pragma GCC ivdep
-                  for (; j <= jMax - 4; j += 4) {
-                     // Unroll by 4: update C[i,j], C[i,j+1], C[i,j+2], C[i,j+3]
-                     Element cij0 = cp[i * P + j];
-                     Element cij1 = cp[i * P + (j + 1)];
-                     Element cij2 = cp[i * P + (j + 2)];
-                     Element cij3 = cp[i * P + (j + 3)];
-                     cij0 += aik * bp[k * P + j];
-                     cij1 += aik * bp[k * P + (j + 1)];
-                     cij2 += aik * bp[k * P + (j + 2)];
-                     cij3 += aik * bp[k * P + (j + 3)];
-                     cp[i * P + j] = cij0;
-                     cp[i * P + (j + 1)] = cij1;
-                     cp[i * P + (j + 2)] = cij2;
-                     cp[i * P + (j + 3)] = cij3;
+                  // Main 4-wide accumulation
+                  for (; j <= j1 - 4; j += 4) {
+                     cp[i * P + j + 0] += aik * bp[k * P + j + 0];
+                     cp[i * P + j + 1] += aik * bp[k * P + j + 1];
+                     cp[i * P + j + 2] += aik * bp[k * P + j + 2];
+                     cp[i * P + j + 3] += aik * bp[k * P + j + 3];
                   }
-#pragma GCC ivdep
-                  for (; j < jMax; ++j) {
-                     // Cleanup loop for remaining j
+                  // Remainder
+                  for (; j < j1; ++j)
                      cp[i * P + j] += aik * bp[k * P + j];
-                  }
                }
             }
          }