From 51c1bd12eac0ed992a66f0bc89350fcab2c33206 Mon Sep 17 00:00:00 2001
From: jongyunhur <jongyun.jh@gmail.com>
Date: Tue, 25 Nov 2025 16:20:20 +0900
Subject: [PATCH] Fix: Correct cuBLAS matrix multiplication order for Row-Major
 inputs

- Swapped operand order to (B, A) and dimensions to (n, m), so that Column-Major
  cuBLAS computes C^T = B^T * A^T, which is Row-Major C = A * B.
- Added comments explaining Row-Major to Column-Major handling.
---
 nvml_cublas.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nvml_cublas.cu b/nvml_cublas.cu
index 0ba9ea5..df02444 100644
--- a/nvml_cublas.cu
+++ b/nvml_cublas.cu
@@ -104,6 +104,9 @@ void calculate( int const &m, int const &n, int const &k, nvmlClass &nvml ) {
 
     /* Fill the matrices with test data */
     /* Assume square matrices */
+    /* Note: We generate data in Row-Major format, but cuBLAS interprets it as Column-Major,
+     * so cuBLAS effectively sees the transposes A^T and B^T. To compensate, we pass the operands
+     * as (B, A): cuBLAS computes C^T = B^T × A^T, which read back as Row-Major is C = A × B. */
     for ( int i = 0; i < m * m; i++ ) {
         h_A[i] = std::rand( ) / static_cast<data_type>( RAND_MAX );
         h_B[i] = std::rand( ) / static_cast<data_type>( RAND_MAX );
@@ -124,7 +127,8 @@ void calculate( int const &m, int const &n, int const &k, nvmlClass &nvml ) {
     data_type *d_C_ptr = thrust::raw_pointer_cast( &d_C[0] );
 
     /* Performs operation using cublas */
-    cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A_ptr, lda, d_B_ptr, ldb, &beta, d_C_ptr, ldc );
+    /* Pass (B, A) with dims (n, m): Column-Major C^T = B^T * A^T is exactly Row-Major C = A * B. */
+    cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B_ptr, ldb, d_A_ptr, lda, &beta, d_C_ptr, ldc );
     CUDA_RT_CALL( cudaDeviceSynchronize( ) );
 
     /* Allocate host memory for reading back the result from device memory */