diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..d5e0157
Binary files /dev/null and b/.DS_Store differ
diff --git a/.clang-format b/.clang-format
index 2aec894..276a9db 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,4 +2,5 @@ BasedOnStyle: Google
DerivePointerAlignment: false
PointerAlignment: true
-Standard: C++11
\ No newline at end of file
+AlignAfterOpenBracket: BlockIndent
+Standard: C++20
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index da20b26..c2b88e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,5 +56,15 @@ gpu-blob
CSV*
Graphs*
-# VS Code
-.vscode
\ No newline at end of file
+# IDE
+.vscode
+
+# macOS metadata
+.DS_Store
+
+# CSV files and graphs
+*.csv
+*.png
+
+# Bash scripts to run on different systems
+*.sh
diff --git a/.idea/GPU-BLAS-Offload-Benchmark.iml b/.idea/GPU-BLAS-Offload-Benchmark.iml
new file mode 100644
index 0000000..190534e
--- /dev/null
+++ b/.idea/GPU-BLAS-Offload-Benchmark.iml
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..a55e7a1
--- /dev/null
+++ b/.idea/codeStyles/codeStyleConfig.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..830d3c8
--- /dev/null
+++ b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..eff3984
--- /dev/null
+++ b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..461bf83
--- /dev/null
+++ b/.idea/workspace.xml
diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh
index 3c6b5c0..f418bdc 100644
--- a/AOCL/gemm.hh
+++ b/AOCL/gemm.hh
@@ -23,6 +23,7 @@ class gemm_cpu : public gemm {
private:
/** Make call to the GEMM kernel. */
void callGemm() override {
+
if constexpr (std::is_same_v<T, float>) {
bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_,
rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_),
diff --git a/AOCL/spmdnm.hh b/AOCL/spmdnm.hh
new file mode 100644
index 0000000..f47007c
--- /dev/null
+++ b/AOCL/spmdnm.hh
@@ -0,0 +1,336 @@
+#pragma once
+
+#ifdef CPU_AOCL
+#include "aoclsparse.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <type_traits>
+
+#include "../include/kernels/CPU/spmdnm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spmdnm_cpu : public spmdnm<T> {
+public:
+  using spmdnm<T>::spmdnm;
+  using spmdnm<T>::callConsume;
+  using spmdnm<T>::initInputMatrices;
+  using spmdnm<T>::m_;
+  using spmdnm<T>::n_;
+  using spmdnm<T>::k_;
+  using spmdnm<T>::B_;
+  using spmdnm<T>::C_;
+  using spmdnm<T>::sparsity_;
+  using spmdnm<T>::type_;
+  using spmdnm<T>::nnz_;
+  using spmdnm<T>::iterations_;
+
+ void initialise(int m, int n, int k, double sparsity,
+ matrixType type, bool binary = false) {
+ base_ = aoclsparse_index_base_zero;
+ order_ = aoclsparse_order_row;
+
+ status_ = aoclsparse_create_mat_descr(&A_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+
+ status_ = aoclsparse_set_mat_index_base(A_description_, base_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_set_mat_index_base is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+
+
+ m_aocl_ = m_ = m;
+ n_aocl_ = n_ = n;
+ k_aocl_ = k_ = k;
+ sparsity_ = sparsity;
+ type_ = type;
+
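+    // Derive the non-zero count from the target sparsity; the +1 guarantees
+    // at least one non-zero element even as sparsity approaches 1.0.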
+ nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
+ nnz_aocl_ = nnz_;
+
+ B_ = (T*)calloc(k_ * n_, sizeof(T));
+ C_ = (T*)calloc(m_ * n_, sizeof(T));
+
+ initInputMatrices();
+ }
+
+protected:
+ void toSparseFormat() override {
+
+ // Initialise datastructures for the CSR format
+ A_rows_ = new aoclsparse_int[m_ + 1];
+ A_cols_ = new aoclsparse_int[nnz_aocl_];
+ A_vals_ = new T[nnz_aocl_];
+
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+
+ // Move into the AOCL CSR matrix handle
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_create_scsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ k_aocl_,
+ nnz_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_create_dcsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ k_aocl_,
+ nnz_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+ }
+
+private:
+ void preLoopRequirements() override {}
+
+ void callSpmdnm() override {
+ operation_ = aoclsparse_operation_none; // Just saying no transposition happening first
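+    // B and C are dense and row-major (order_ == aoclsparse_order_row),
+    // each with leading dimension n.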
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_scsrmm(operation_,
+ alpha,
+ A_aocl_,
+ A_description_,
+ order_,
+ B_,
+ n_aocl_,
+ n_aocl_,
+ beta,
+ C_,
+ n_aocl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_dcsrmm(operation_,
+ alpha,
+ A_aocl_,
+ A_description_,
+ order_,
+ B_,
+ n_aocl_,
+ n_aocl_,
+ beta,
+ C_,
+ n_aocl_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_?csrmm is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ std::cerr << "\tm_aocl_=" << m_aocl_ << std::endl;
+ std::cerr << "\tn_aocl_=" << n_aocl_ << std::endl;
+ std::cerr << "\tk_aocl_=" << k_aocl_ << std::endl;
+ std::cerr << "\tnnz_aocl_=" << nnz_aocl_ << std::endl;
+ printAOCLError(status_);
+ }
+ }
+
+ void postLoopRequirements() override {
+ }
+
+ void postCallKernelCleanup() override {
+ status_ = aoclsparse_destroy_mat_descr(A_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+ status_ = aoclsparse_destroy(&A_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+ delete[] A_vals_;
+ delete[] A_cols_;
+ delete[] A_rows_;
+    free(B_);
+    free(C_);
+ }
+
+ void printAOCLError(aoclsparse_status stat) {
+ switch (stat) {
+ case aoclsparse_status_success:
+ std::cerr << "SUCCESS - The operation completed successfully";
+ break;
+ case aoclsparse_status_not_implemented:
+ std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version";
+ break;
+ case aoclsparse_status_invalid_pointer:
+ std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid";
+ break;
+ case aoclsparse_status_invalid_size:
+ std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) contain an invalid value (e.g., negative or zero where positive required)";
+ break;
+ case aoclsparse_status_internal_error:
+ std::cerr << "INTERNAL_ERROR - Internal library failure";
+ break;
+ case aoclsparse_status_invalid_value:
+ std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)";
+ break;
+ case aoclsparse_status_invalid_index_value:
+ std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)";
+ break;
+ case aoclsparse_status_maxit:
+ std::cerr << "MAXIT - function stopped after reaching number of iteration limit";
+ break;
+ case aoclsparse_status_user_stop:
+ std::cerr << "USER_STOP - user requested termination";
+ break;
+ case aoclsparse_status_wrong_type:
+ std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)";
+ break;
+ case aoclsparse_status_memory_error:
+ std::cerr << "MEMORY_ERROR - memory allocation failure";
+ break;
+ case aoclsparse_status_numerical_error:
+        std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, divide-by-zero error";
+ break;
+ case aoclsparse_status_invalid_operation:
+ std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point";
+ break;
+ case aoclsparse_status_unsorted_input:
+ std::cerr << "UNSORTED_INPUT - the input matrices are not sorted";
+ break;
+ case aoclsparse_status::aoclsparse_status_invalid_kid:
+ std::cerr << "INVALID_KID - user requested kernel id was not available";
+ break;
+ default:
+ std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")";
+ break;
+ }
+ std::cerr << std::endl;
+ exit(1);
+ }
+
+ void internalCheck(aoclsparse_int maj_dim,
+ aoclsparse_int min_dim,
+ aoclsparse_int nnz,
+ const aoclsparse_int *idx_ptr,
+ const aoclsparse_int *indices,
+ const void *val,
+ int shape,
+ int base) {
+ if (idx_ptr == nullptr) {
+ std::cerr << "INVALID ROWS ARRAY" << std::endl;
+ exit(1);
+ }
+ if (indices == nullptr){
+ std::cerr << "INVALID COLS ARRAY" << std::endl;
+ exit(1);
+ }
+ if (val == nullptr){
+ std::cerr << "INVALID VALS ARRAY" << std::endl;
+ exit(1);
+ }
+
+ if ((min_dim < 0) || (maj_dim < 0) || (nnz < 0)) {
+ std::cerr << "Wrong min_dim/maj_dim/nnz" << std::endl;
+ exit(1);
+ }
+
+ if ((idx_ptr[0] - base) != 0) {
+ std::cerr << "Wrong csr_row_ptr[0] or csc.col_ptr[0]" << std::endl;
+ exit(1);
+ }
+
+ if ((idx_ptr[maj_dim] - base) != nnz) {
+ std::cerr << "Wrong csr_row_ptr[m]!=nnz or csc.col_ptr[n]!=nnz" << std::endl;
+ exit(1);
+ }
+ for (aoclsparse_int i = 1; i <= maj_dim; i++) {
+ if (idx_ptr[i - 1] > idx_ptr[i]) {
+ std::cerr << "Wrong csr_row_ptr/csc.col_ptr - not nondecreasing" << std::endl;
+ exit (1);
+ }
+ }
+
+ // assume indices are fully sorted & fulldiag matrix unless proved otherwise
+ int sort = 1;
+ bool fulldiag = true;
+
+ aoclsparse_int idxstart, idxend, j, jmin = 0, jmax = min_dim - 1;
+ for (aoclsparse_int i = 0; i < maj_dim; i++) {
+ idxend = idx_ptr[i + 1] - base;
+ idxstart = idx_ptr[i] - base;
+ if (shape == 1) {
+ jmin = 0;
+ jmax = i;
+ } else if (shape == 2) {
+ jmin = i;
+ jmax = min_dim - 1;
+ }
+ // check if visited D, U group within this row
+ bool diagonal = false, upper = false;
+ aoclsparse_int prev = -1; // holds previous col index, initially set to -1
+
+ for (aoclsparse_int idx = idxstart; idx < idxend; idx++) {
+ j = indices[idx] - base;
+ if (j < jmin || j > jmax) {
+ std::cerr << "Wrong index - out of bounds or triangle, @idx=" << idx << ": j=" << j << ", i=" << i << std::endl;
+ exit(1);
+ }
+ // check for sorting pattern for each element in a row
+ if (sort != 3) {
+ if (prev > j) sort = 2; // unsorted col idx (duplicate elements are allowed)
+ else prev = j; // update previous col index
+
+ // check for group-order
+ if ((j <= i && upper) || (j < i && diagonal)) sort = 3;
+ }
+ if (j > i) upper = true;
+ else if(j == i) {
+ if (diagonal) {
+ std::cerr << "Wrong diag - duplicate diag for i=j=" << i << std::endl;
+ exit(1);
+ }
+ // diagonal element visited
+ diagonal = true;
+ }
+ }
+ if (!diagonal && i < min_dim) fulldiag = false; // missing diagonal
+ }
+ }
+
+ aoclsparse_status status_;
+ aoclsparse_order order_;
+
+ aoclsparse_operation operation_;
+ aoclsparse_index_base base_;
+
+ aoclsparse_mat_descr A_description_;
+ aoclsparse_matrix A_aocl_;
+ aoclsparse_int* A_rows_;
+ aoclsparse_int* A_cols_;
+ T* A_vals_;
+
+ aoclsparse_int m_aocl_;
+ aoclsparse_int n_aocl_;
+ aoclsparse_int k_aocl_;
+ aoclsparse_int nnz_aocl_;
+
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+};
+}
+
+
+#endif
diff --git a/AOCL/spmdnv.hh b/AOCL/spmdnv.hh
new file mode 100644
index 0000000..d529713
--- /dev/null
+++ b/AOCL/spmdnv.hh
@@ -0,0 +1,273 @@
+#pragma once
+
+#ifdef CPU_AOCL
+
+#include "aoclsparse.h"
+#include <cstdlib>
+#include <iostream>
+#include <type_traits>
+
+#include "../include/kernels/CPU/spmdnv.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spmdnv_cpu : public spmdnv<T> {
+public:
+  using spmdnv<T>::spmdnv;
+  using spmdnv<T>::callConsume;
+  using spmdnv<T>::initInputMatrixVector;
+  using spmdnv<T>::m_;
+  using spmdnv<T>::n_;
+  using spmdnv<T>::x_;
+  using spmdnv<T>::y_;
+  using spmdnv<T>::sparsity_;
+  using spmdnv<T>::type_;
+  using spmdnv<T>::nnz_;
+  using spmdnv<T>::iterations_;
+
+ void initialise(int m, int n, double sparsity, matrixType type,
+ bool binary = false) {
+ if (print_) std::cout << "=========== Matrix = " << m << "x" << n << " ===========" << std::endl;
+ base_ = aoclsparse_index_base_zero;
+ operation_ = aoclsparse_operation_none;
+
+ m_aocl_ = m_ = m;
+ n_aocl_ = n_ = n;
+ sparsity_ = sparsity;
+ type_ = type;
+
+ nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
+ nnz_aocl_ = nnz_;
+
+ x_ = (T*)calloc(n_, sizeof(T));
+ y_ = (T*)calloc(m_, sizeof(T));
+
+ if (print_) std::cout << "About to initialise matrices" << std::endl;
+ initInputMatrixVector();
+
+ status_ = aoclsparse_create_mat_descr(&A_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_mat_descr failing for A" << std::endl;
+ printAOCLError(status_);
+ }
+ }
+
+protected:
+ void toSparseFormat() override {
+ A_vals_ = (T*)calloc(nnz_aocl_, sizeof(T));
+ A_cols_ = (aoclsparse_int*)calloc(nnz_aocl_, sizeof(aoclsparse_int));
+ A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int));
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+
+
+ // Move into the AOCL CSR matrix handle
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_create_scsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ n_aocl_,
+ nnz_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_create_dcsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ n_aocl_,
+ nnz_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << n_ << " . " << n_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_create_?csr success" << std::endl;
+ }
+ }
+
+private:
+ void preLoopRequirements() override {
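+    // Hint the expected SpMV usage to the library, then let it pre-process
+    // the matrix before the main SpMV loop.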
+    status_ = aoclsparse_set_mv_hint(A_aocl_,
+                                     operation_,
+                                     A_description_,
+                                     5); // Iteration-count hint; currently hard-coded
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_set_mv_hint failing" << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_set_mv_hint success" << std::endl;
+ }
+
+ status_ = aoclsparse_optimize(A_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_optimize failing" << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_optimize success" << std::endl;
+ }
+ }
+
+ void callSpMDnV() override {
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_smv(operation_,
+ &alpha,
+ A_aocl_,
+ A_description_,
+ x_,
+ &beta,
+ y_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_dmv(operation_,
+ &alpha,
+ A_aocl_,
+ A_description_,
+ x_,
+ &beta,
+ y_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_?mv failing" << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_?mv success" << std::endl;
+ }
+ }
+
+ void postLoopRequirements() override {
+ if (debug) {
+ std::cout << "========== CPU ==========" << std::endl;
+ std::cout << "___________________________________________" << std::endl;
+ std::cout << "x =" << std::endl;
+ std::cout << "[";
+ for (int64_t i = 0; i < n_; i++) {
+ std::cout << x_[i];
+ if (i < (n_ - 1)) std::cout << ", ";
+ }
+ std::cout << "]" << std::endl;
+
+ std::cout << "y =" << std::endl;
+ std::cout << "[";
+ for (int64_t i = 0; i < m_; i++) {
+ std::cout << y_[i];
+ if (i < (m_ - 1)) std::cout << ", ";
+ }
+ std::cout << "]" << std::endl;
+ std::cout << "___________________________________________" << std::endl;
+ }
+ }
+
+ void postCallKernelCleanup() override {
+ status_ = aoclsparse_destroy_mat_descr(A_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy_mat_descr failing" << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_destroy_mat_descr success" << std::endl;
+ }
+
+ status_ = aoclsparse_destroy(&A_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy failing" << std::endl;
+ printAOCLError(status_);
+ } else if (print_) {
+ std::cout << "aoclsparse_destroy success" << std::endl;
+ }
+
+    free(A_vals_);
+    free(A_cols_);
+    free(A_rows_);
+    free(x_);
+    free(y_);
+ }
+
+ void printAOCLError(aoclsparse_status stat) {
+ switch (stat) {
+ case aoclsparse_status_success:
+ std::cerr << "SUCCESS - The operation completed successfully";
+ break;
+ case aoclsparse_status_not_implemented:
+ std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version";
+ break;
+ case aoclsparse_status_invalid_pointer:
+ std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid";
+ break;
+ case aoclsparse_status_invalid_size:
+ std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) contain an invalid value (e.g., negative or zero where positive required)";
+ break;
+ case aoclsparse_status_internal_error:
+ std::cerr << "INTERNAL_ERROR - Internal library failure";
+ break;
+ case aoclsparse_status_invalid_value:
+ std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)";
+ break;
+ case aoclsparse_status_invalid_index_value:
+ std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)";
+ break;
+ case aoclsparse_status_maxit:
+ std::cerr << "MAXIT - function stopped after reaching number of iteration limit";
+ break;
+ case aoclsparse_status_user_stop:
+ std::cerr << "USER_STOP - user requested termination";
+ break;
+ case aoclsparse_status_wrong_type:
+ std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)";
+ break;
+ case aoclsparse_status_memory_error:
+ std::cerr << "MEMORY_ERROR - memory allocation failure";
+ break;
+ case aoclsparse_status_numerical_error:
+        std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, divide-by-zero error";
+ break;
+ case aoclsparse_status_invalid_operation:
+ std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point";
+ break;
+ case aoclsparse_status_unsorted_input:
+ std::cerr << "UNSORTED_INPUT - the input matrices are not sorted";
+ break;
+ case aoclsparse_status::aoclsparse_status_invalid_kid:
+ std::cerr << "INVALID_KID - user requested kernel id was not available";
+ break;
+ default:
+ std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")";
+ break;
+ }
+ std::cerr << std::endl;
+ exit(1);
+ }
+
+ bool print_ = false;
+ bool debug = false;
+
+ aoclsparse_status status_;
+
+ aoclsparse_operation operation_;
+ aoclsparse_index_base base_;
+
+ aoclsparse_matrix A_aocl_;
+ aoclsparse_int* A_rows_;
+ aoclsparse_int* A_cols_;
+ T* A_vals_;
+ aoclsparse_int m_aocl_;
+ aoclsparse_int n_aocl_;
+ aoclsparse_int nnz_aocl_;
+
+ aoclsparse_mat_descr A_description_;
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+};
+}
+
+
+#endif
diff --git a/AOCL/spmspm.hh b/AOCL/spmspm.hh
new file mode 100644
index 0000000..38c5c3f
--- /dev/null
+++ b/AOCL/spmspm.hh
@@ -0,0 +1,375 @@
+#pragma once
+
+#ifdef CPU_AOCL
+#include "aoclsparse.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <type_traits>
+
+#include "../include/kernels/CPU/spmspm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spmspm_cpu : public spmspm<T> {
+public:
+  using spmspm<T>::spmspm;
+  using spmspm<T>::callConsume;
+  using spmspm<T>::initInputMatrices;
+  using spmspm<T>::m_;
+  using spmspm<T>::n_;
+  using spmspm<T>::k_;
+  using spmspm<T>::sparsity_;
+  using spmspm<T>::type_;
+  using spmspm<T>::A_nnz_;
+  using spmspm<T>::B_nnz_;
+  using spmspm<T>::iterations_;
+  using spmspm<T>::C_rows_;
+  using spmspm<T>::C_cols_;
+  using spmspm<T>::C_vals_;
+  using spmspm<T>::C_nnz_;
+
+ void initialise(int m, int n, int k, double sparsity, matrixType type,
+ bool binary = false) {
+ sparsity_ = sparsity;
+ type_ = type;
+
+ m_aocl_ = m_ = m;
+ n_aocl_ = n_ = n;
+ k_aocl_ = k_ = k;
+
+
+ uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_;
+ uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_;
+ nnzA_aocl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity));
+ nnzB_aocl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity));
+ C_allocated = false;
+
+ base_ = aoclsparse_index_base_zero;
+ operationA_ = aoclsparse_operation_none;
+ operationB_ = aoclsparse_operation_none;
+
+ status_ = aoclsparse_create_mat_descr(&A_description_);
+ if (status_ != aoclsparse_status_success) {
+ printAOCLError(status_);
+ }
+ status_ = aoclsparse_create_mat_descr(&B_description_);
+ if (status_ != aoclsparse_status_success) {
+ printAOCLError(status_);
+ }
+ initInputMatrices();
+ }
+
+protected:
+ void toSparseFormat() override {
+ A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int));
+ A_cols_ = (aoclsparse_int*)calloc(nnzA_aocl_, sizeof(aoclsparse_int));
+ A_vals_ = (T*)calloc(nnzA_aocl_, sizeof(T));
+ if (A_rows_ == nullptr || A_cols_ == nullptr || A_vals_ == nullptr) {
+ std::cerr << "Failed to allocate memory for A CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ exit(1);
+ }
+
+ // Initialise datastructures for the CSR format
+ B_rows_ = (aoclsparse_int*)calloc(k_ + 1, sizeof(aoclsparse_int));
+ B_cols_ = (aoclsparse_int*)calloc(nnzB_aocl_, sizeof(aoclsparse_int));
+ B_vals_ = (T*)calloc(nnzB_aocl_, sizeof(T));
+ if (B_rows_ == nullptr || B_cols_ == nullptr || B_vals_ == nullptr) {
+ std::cerr << "Failed to allocate memory for B CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ exit(1);
+ }
+
+ int seedOffset = 0;
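+    // Regenerate A and B (bumping the seed on every attempt) until the
+    // product C = A * B would contain at least one non-zero entry.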
+ do {
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++);
+ rMatCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++);
+ randomCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++);
+ finiteElementCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+ } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0);
+
+ // Move into the AOCL CSR matrix handle
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_create_scsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ k_aocl_,
+ nnzA_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_create_dcsr(&A_aocl_,
+ base_,
+ m_aocl_,
+ k_aocl_,
+ nnzA_aocl_,
+ A_rows_,
+ A_cols_,
+ A_vals_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_?csr for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+
+ // Now sort the matrix -- needed for this AOCL function
+ status_ = aoclsparse_order_mat(A_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_order_mat for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+
+ // Move into the AOCL CSR matrix handle
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_create_scsr(&B_aocl_,
+ base_,
+ k_aocl_,
+ n_aocl_,
+ nnzB_aocl_,
+ B_rows_,
+ B_cols_,
+ B_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_create_dcsr(&B_aocl_,
+ base_,
+ k_aocl_,
+ n_aocl_,
+ nnzB_aocl_,
+ B_rows_,
+ B_cols_,
+ B_vals_);
+ }
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_create_?csr for B is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+
+ // Now sort the matrix -- needed for this AOCL function
+ status_ = aoclsparse_order_mat(B_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_order_mat for B is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl;
+ printAOCLError(status_);
+ }
+ }
+
+private:
+ void preLoopRequirements() override {
+ }
+
+ void callSpmspm() override {
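+    // Free the C buffers exported on the previous iteration before
+    // computing a new product.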
+ if (C_allocated) {
+ if (C_vals_ != nullptr) {
+ free(C_vals_);
+ C_vals_ = nullptr;
+ }
+ if (C_cols_aocl_ != nullptr) {
+ free(C_cols_aocl_);
+ C_cols_aocl_ = nullptr;
+ }
+ if (C_rows_aocl_ != nullptr) {
+ free(C_rows_aocl_);
+ C_rows_aocl_ = nullptr;
+ }
+ C_allocated = false;
+ }
+
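+    // aoclsparse_sp2m is a two-stage call: the nnz_count stage determines the
+    // size and structure of C, and the finalize stage fills in its values.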
+ request_ = aoclsparse_stage_nnz_count;
+ status_ = aoclsparse_sp2m(operationA_,
+ A_description_,
+ A_aocl_,
+ operationB_,
+ B_description_,
+ B_aocl_,
+ request_,
+ &C_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_nnz_count" << std::endl;
+ printAOCLError(status_);
+ }
+
+ request_ = aoclsparse_stage_finalize;
+ status_ = aoclsparse_sp2m(operationA_,
+ A_description_,
+ A_aocl_,
+ operationB_,
+ B_description_,
+ B_aocl_,
+ request_,
+ &C_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_finalize" << std::endl;
+ printAOCLError(status_);
+ }
+
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = aoclsparse_export_scsr(C_aocl_,
+ &base_,
+ &C_M,
+ &C_N,
+ &nnzC_aocl_,
+ &C_rows_aocl_,
+ &C_cols_aocl_,
+ &C_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = aoclsparse_export_dcsr(C_aocl_,
+ &base_,
+ &C_M,
+ &C_N,
+ &nnzC_aocl_,
+ &C_rows_aocl_,
+ &C_cols_aocl_,
+ &C_vals_);
+ }
+ if (status_ != aoclsparse_status_success) {
+      std::cerr << "aoclsparse_export_?csr failing" << std::endl;
+ printAOCLError(status_);
+ }
+ C_allocated = true;
+ }
+
+ void postLoopRequirements() override {
+ C_nnz_ = nnzC_aocl_; // Needed for checksum
+ }
+
+ void postCallKernelCleanup() override {
+ status_ = aoclsparse_destroy_mat_descr(A_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy_mat_descr failing for A_description_" << std::endl;
+ printAOCLError(status_);
+ }
+ status_ = aoclsparse_destroy_mat_descr(B_description_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy_mat_descr failing for B_description_" << std::endl;
+ printAOCLError(status_);
+ }
+
+ status_ = aoclsparse_destroy(&A_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy failing for A" << std::endl;
+ printAOCLError(status_);
+ }
+ status_ = aoclsparse_destroy(&B_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy failing for B" << std::endl;
+ printAOCLError(status_);
+ }
+
+ status_ = aoclsparse_destroy(&C_aocl_);
+ if (status_ != aoclsparse_status_success) {
+ std::cerr << "aoclsparse_destroy failing for C" << std::endl;
+ printAOCLError(status_);
+ }
+ free(A_rows_);
+ free(A_cols_);
+ free(A_vals_);
+ free(B_rows_);
+ free(B_cols_);
+ free(B_vals_);
+ }
+
+ void printAOCLError(aoclsparse_status stat) {
+ switch (stat) {
+ case aoclsparse_status_success:
+ std::cerr << "SUCCESS - The operation completed successfully";
+ break;
+ case aoclsparse_status_not_implemented:
+ std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version";
+ break;
+ case aoclsparse_status_invalid_pointer:
+ std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid";
+ break;
+ case aoclsparse_status_invalid_size:
+ std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) contain an invalid value (e.g., negative or zero where positive required)";
+ break;
+ case aoclsparse_status_internal_error:
+ std::cerr << "INTERNAL_ERROR - Internal library failure";
+ break;
+ case aoclsparse_status_invalid_value:
+ std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)";
+ break;
+ case aoclsparse_status_invalid_index_value:
+ std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)";
+ break;
+ case aoclsparse_status_maxit:
+ std::cerr << "MAXIT - function stopped after reaching number of iteration limit";
+ break;
+ case aoclsparse_status_user_stop:
+ std::cerr << "USER_STOP - user requested termination";
+ break;
+ case aoclsparse_status_wrong_type:
+ std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)";
+ break;
+ case aoclsparse_status_memory_error:
+ std::cerr << "MEMORY_ERROR - memory allocation failure";
+ break;
+ case aoclsparse_status_numerical_error:
+        std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, divide-by-zero error";
+ break;
+ case aoclsparse_status_invalid_operation:
+ std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point";
+ break;
+ case aoclsparse_status_unsorted_input:
+ std::cerr << "UNSORTED_INPUT - the input matrices are not sorted";
+ break;
+ case aoclsparse_status::aoclsparse_status_invalid_kid:
+ std::cerr << "INVALID_KID - user requested kernel id was not available";
+ break;
+ default:
+ std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")";
+ break;
+ }
+ std::cerr << std::endl;
+ exit(1);
+ }
+
+ aoclsparse_status status_;
+
+ aoclsparse_operation operationA_;
+ aoclsparse_operation operationB_;
+ aoclsparse_index_base base_;
+ aoclsparse_request request_;
+
+ aoclsparse_matrix A_aocl_;
+ aoclsparse_int* A_rows_ = nullptr;
+ aoclsparse_int* A_cols_ = nullptr;
+ T* A_vals_ = nullptr;
+
+ aoclsparse_matrix B_aocl_;
+ aoclsparse_int* B_rows_ = nullptr;
+ aoclsparse_int* B_cols_ = nullptr;
+ T* B_vals_ = nullptr;
+
+ aoclsparse_matrix C_aocl_;
+ aoclsparse_int* C_rows_aocl_ = nullptr;
+ aoclsparse_int* C_cols_aocl_ = nullptr;
+ bool C_allocated = false;
+
+ aoclsparse_int m_aocl_;
+ aoclsparse_int n_aocl_;
+ aoclsparse_int k_aocl_;
+ aoclsparse_int nnzA_aocl_;
+ aoclsparse_int nnzB_aocl_;
+ aoclsparse_int nnzC_aocl_;
+
+ aoclsparse_int C_M, C_N;
+
+ aoclsparse_mat_descr A_description_;
+ aoclsparse_mat_descr B_description_;
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+};
+}
+
+
+#endif
diff --git a/ArmPL/gemm.hh b/ArmPL/gemm.hh
index af7f428..10903d8 100644
--- a/ArmPL/gemm.hh
+++ b/ArmPL/gemm.hh
@@ -1,7 +1,7 @@
#pragma once
#ifdef CPU_ARMPL
-#include <armpl.h>
+#include "armpl.h"
#include
#include
@@ -36,8 +36,7 @@ class gemm_cpu : public gemm {
std::max(1, m_));
} else {
// Un-specialised class will not do any work - print error and exit.
- std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported."
- << std::endl;
+ std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." << std::endl;
exit(1);
}
// Ensure compiler doesn't optimise away the work being done
diff --git a/ArmPL/gemv.hh b/ArmPL/gemv.hh
index cc0e9bf..c568c99 100644
--- a/ArmPL/gemv.hh
+++ b/ArmPL/gemv.hh
@@ -1,7 +1,7 @@
#pragma once
#ifdef CPU_ARMPL
-#include <armpl.h>
+#include "armpl.h"
#include
#include
@@ -34,8 +34,7 @@ class gemv_cpu : public gemv {
std::max(1, m_), x_, vecIncrement_, beta, y_, vecIncrement_);
} else {
// Un-specialised class will not do any work - print error and exit.
- std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported."
- << std::endl;
+ std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." << std::endl;
exit(1);
}
// Ensure compiler doesn't optimise away the work being done
diff --git a/ArmPL/spmdnm.hh b/ArmPL/spmdnm.hh
new file mode 100644
index 0000000..4f53c10
--- /dev/null
+++ b/ArmPL/spmdnm.hh
@@ -0,0 +1,43 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+
+#include "../include/kernels/CPU/spmdnm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
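+/** Placeholder SpMDnM kernel for ArmPL -- every hook is currently a no-op. */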
+template <typename T>
+class spmdnm_cpu : public spmdnm<T> {
+public:
+  using spmdnm<T>::spmdnm;
+  using spmdnm<T>::callConsume;
+  using spmdnm<T>::initInputMatrices;
+  using spmdnm<T>::m_;
+  using spmdnm<T>::n_;
+  using spmdnm<T>::k_;
+  using spmdnm<T>::B_;
+  using spmdnm<T>::C_;
+  using spmdnm<T>::sparsity_;
+  using spmdnm<T>::type_;
+  using spmdnm<T>::nnz_;
+  using spmdnm<T>::iterations_;
+
+ void initialise(int m, int n, int k, double sparsity,
+ matrixType type, bool binary = false) {}
+
+protected:
+ void toSparseFormat() override {}
+
+private:
+ void preLoopRequirements() override {}
+
+ void callSpmdnm() override {}
+
+ void postLoopRequirements() override {}
+
+ void postCallKernelCleanup() override {}
+};
+}
+
+
+#endif
diff --git a/ArmPL/spmdnv.hh b/ArmPL/spmdnv.hh
new file mode 100644
index 0000000..7b0cf93
--- /dev/null
+++ b/ArmPL/spmdnv.hh
@@ -0,0 +1,205 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+#include "armpl.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <type_traits>
+
+#include "../include/kernels/CPU/spmdnv.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for sparse matrix-dense vector (SpMDnV) CPU kernels. */
+template <typename T>
+class spmdnv_cpu : public spmdnv<T> {
+public:
+  using spmdnv<T>::spmdnv;
+  using spmdnv<T>::callConsume;
+  using spmdnv<T>::initInputMatrixVector;
+  using spmdnv<T>::m_;
+  using spmdnv<T>::n_;
+  using spmdnv<T>::x_;
+  using spmdnv<T>::y_;
+  using spmdnv<T>::sparsity_;
+  using spmdnv<T>::type_;
+  using spmdnv<T>::nnz_;
+  using spmdnv<T>::iterations_;
+
+ /** Initialise the required data structures. */
+ void initialise(int m, int n, double sparsity, matrixType type,
+ bool binary = false) {
+ m_armpl_ = m_ = m;
+ n_armpl_ = n_ = n;
+ sparsity_ = sparsity;
+ type_ = type;
+
+ nnz_armpl_ = nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
+
+ x_ = (T*)calloc(n_, sizeof(T));
+ y_ = (T*)calloc(m_, sizeof(T));
+
+ // Initialise the matrix and vectors
+ initInputMatrixVector();
+ }
+
+protected:
+ void toSparseFormat() override {
+ // Make arrays for A
+ A_vals_ = (T*)calloc(nnz_armpl_, sizeof(T));
+ A_cols_ = (armpl_int_t*)calloc(nnz_armpl_, sizeof(armpl_int_t));
+ A_rows_ = (armpl_int_t*)calloc(m_ + 1, sizeof(armpl_int_t));
+
+ // Fill the CSR arrays
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+
+ // Create the armpl object for this sparse matrix
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = armpl_spmat_create_csr_s(&A_armpl_,
+ m_armpl_,
+ n_armpl_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ 0);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = armpl_spmat_create_csr_d(&A_armpl_,
+ m_armpl_,
+ n_armpl_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ 0);
+ } else {
+ // Un-specialised class will not do any work - print error and exit.
+ std::cerr << "ERROR - Datatype for ArmPL CPU SpMDnV kernel not supported." << std::endl;
+ exit(1);
+ }
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ }
+
+private:
+  /** Perform any required steps before calling the SpMDnV kernel that should
+   * be timed. */
+ void preLoopRequirements() override {
+ // Give the library some hints so it can optimise the performance of the kernel
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_MEMORY,
+ ARMPL_SPARSE_MEMORY_NOALLOCS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_STRUCTURE,
+ ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_SPMV_INVOCATIONS,
+ ARMPL_SPARSE_INVOCATIONS_FEW);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_SPMV_OPERATION,
+ ARMPL_SPARSE_OPERATION_NOTRANS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ // Now optimise the matrix for SpMV based on the hints given
+ status_ = armpl_spmv_optimize(A_armpl_);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ }
+
+ /** Make call to the SpMDnV kernel. */
+ void callSpMDnV() override {
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = armpl_spmv_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS,
+ alpha,
+ A_armpl_,
+ x_,
+ beta,
+ y_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = armpl_spmv_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS,
+ alpha,
+ A_armpl_,
+ x_,
+ beta,
+ y_);
+ } else {
+ // Un-specialised class will not do any work - print error and exit.
+      std::cerr << "ERROR - Datatype for ArmPL CPU SpMDnV kernel not supported." << std::endl;
+ exit(1);
+ }
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR: " << status_ << std::endl;
+ exit(1);
+ }
+
+ // Ensure compiler doesn't optimise away the work being done
+ callConsume();
+ }
+
+
+
+ /** Perform any required steps after calling the SpMDnV kernel that should
+ * be timed. */
+ void postLoopRequirements() override {}
+
+ void postCallKernelCleanup() override {
+ status_ = armpl_spmat_destroy(A_armpl_);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ free(A_rows_);
+ free(A_cols_);
+ free(A_vals_);
+ free(x_);
+ free(y_);
+ }
+
+ armpl_status_t status_;
+
+ armpl_int_t n_armpl_;
+ armpl_int_t m_armpl_;
+ armpl_int_t nnz_armpl_;
+
+ T* A_vals_;
+ armpl_int_t* A_rows_;
+ armpl_int_t* A_cols_;
+
+ armpl_spmat_t A_armpl_;
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+};
+} // namespace cpu
+#endif
\ No newline at end of file
diff --git a/ArmPL/spmspm.hh b/ArmPL/spmspm.hh
new file mode 100644
index 0000000..bb17392
--- /dev/null
+++ b/ArmPL/spmspm.hh
@@ -0,0 +1,364 @@
+#pragma once
+
+#ifdef CPU_ARMPL
+#include "armpl.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "../include/kernels/CPU/spmspm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */
+template <typename T>
+class spmspm_cpu : public spmspm<T> {
+public:
+  using spmspm<T>::spmspm;
+  using spmspm<T>::callConsume;
+  using spmspm<T>::initInputMatrices;
+  using spmspm<T>::m_;
+  using spmspm<T>::n_;
+  using spmspm<T>::k_;
+  using spmspm<T>::sparsity_;
+  using spmspm<T>::type_;
+  using spmspm<T>::A_nnz_;
+  using spmspm<T>::B_nnz_;
+  using spmspm<T>::iterations_;
+  using spmspm<T>::C_vals_;
+  using spmspm<T>::C_nnz_;
+
+ void initialise(int m, int n, int k, double sparsity, matrixType type,
+ bool binary = false) {
+ sparsity_ = sparsity;
+ type_ = type;
+
+ m_armpl_ = m_ = m;
+ n_armpl_ = n_ = n;
+ k_armpl_ = k_ = k;
+
+
+ uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_;
+ uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_;
+ nnzA_armpl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity));
+ nnzB_armpl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity));
+
+ initInputMatrices();
+ }
+
+protected:
+ void toSparseFormat() override {
+ A_vals_ = (T*)calloc(nnzA_armpl_, sizeof(T));
+ A_cols_ = (armpl_int_t*)calloc(nnzA_armpl_, sizeof(armpl_int_t));
+ A_rows_ = (armpl_int_t*)calloc(m_armpl_ + 1, sizeof(armpl_int_t));
+
+ B_vals_ = (T*)calloc(nnzB_armpl_, sizeof(T));
+ B_cols_ = (armpl_int_t*)calloc(nnzB_armpl_, sizeof(armpl_int_t));
+ B_rows_ = (armpl_int_t*)calloc(k_armpl_ + 1, sizeof(armpl_int_t));
+
+ int seedOffset = 0;
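+    // Regenerate A and B (bumping the seed on every attempt) until the
+    // product C = A * B would contain at least one non-zero entry.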
+ do {
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++);
+ rMatCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++);
+ randomCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++);
+ finiteElementCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+ } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0);
+
+ // Now make the sparse matrix objects
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = armpl_spmat_create_csr_s(&A_armpl_,
+ m_armpl_,
+ k_armpl_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ 0);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_create_csr_s(&B_armpl_,
+ k_armpl_,
+ n_armpl_,
+ B_rows_,
+ B_cols_,
+ B_vals_,
+ 0);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = armpl_spmat_create_csr_d(&A_armpl_,
+ m_armpl_,
+ k_armpl_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ 0);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_create_csr_d(&B_armpl_,
+ k_armpl_,
+ n_armpl_,
+ B_rows_,
+ B_cols_,
+ B_vals_,
+ 0);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ }
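+    // C starts out as an m x n null ArmPL matrix; the library is expected to
+    // allocate and populate it when armpl_spmm_exec_* runs.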
+ C_armpl_ = armpl_spmat_create_null(m_armpl_, n_armpl_);
+ }
+
+private:
+ void preLoopRequirements() override {
+ // Populate A and B with hints
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_MEMORY,
+ ARMPL_SPARSE_MEMORY_NOALLOCS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_STRUCTURE,
+ ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+ ARMPL_SPARSE_INVOCATIONS_FEW);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_OPERATION,
+ ARMPL_SPARSE_OPERATION_NOTRANS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(A_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+ ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_MEMORY,
+ ARMPL_SPARSE_MEMORY_NOALLOCS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_STRUCTURE,
+ ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_INVOCATIONS,
+ ARMPL_SPARSE_INVOCATIONS_FEW);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_OPERATION,
+ ARMPL_SPARSE_OPERATION_NOTRANS);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+ ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_hint(B_armpl_,
+ ARMPL_SPARSE_HINT_SPMM_STRATEGY,
+ ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ // Call the optimise function to apply hints
+ status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS,
+ ARMPL_SPARSE_OPERATION_NOTRANS,
+ ARMPL_SPARSE_SCALAR_ONE,
+ A_armpl_,
+ B_armpl_,
+ ARMPL_SPARSE_SCALAR_ZERO,
+ C_armpl_);
+ }
+
+  void callSpmspm() override {
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = armpl_spmm_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS,
+ ARMPL_SPARSE_OPERATION_NOTRANS,
+ alpha,
+ A_armpl_,
+ B_armpl_,
+ beta,
+ C_armpl_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = armpl_spmm_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS,
+ ARMPL_SPARSE_OPERATION_NOTRANS,
+ alpha,
+ A_armpl_,
+ B_armpl_,
+ beta,
+ C_armpl_);
+ } else {
+ // Un-specialised class will not do any work - print error and exit.
+ std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl;
+ exit(1);
+ }
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR: " << status_ << std::endl;
+ exit(1);
+ }
+ }
+
+ void postLoopRequirements() override {
+ // Export the C arrays from the structure
+    if constexpr (std::is_same_v<T, float>) {
+ status_ = armpl_spmat_export_csr_s(C_armpl_,
+ 0,
+ &m_armpl_,
+ &n_armpl_,
+ &C_rows_,
+ &C_cols_,
+ &C_vals_);
+    } else if constexpr (std::is_same_v<T, double>) {
+ status_ = armpl_spmat_export_csr_d(C_armpl_,
+ 0,
+ &m_armpl_,
+ &n_armpl_,
+ &C_rows_,
+ &C_cols_,
+ &C_vals_);
+ } else {
+ // Un-specialised class will not do any work - print error and exit.
+ std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl;
+ exit(1);
+ }
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cerr << "ERROR: " << status_ << std::endl;
+ exit(1);
+ }
+ C_nnz_ = nnzC_armpl_ = C_rows_[m_armpl_];
+
+    // ArmPL does not seem to enforce ordered column indices in its output
+    // matrices, so the columns within each row are sorted here to allow the
+    // checksum to be computed.
+ for (int i = 0; i < m_; i++) {
+ int start = C_rows_[i];
+ int end = C_rows_[i + 1];
+ int len = end - start;
+ if (len > 1) {
+        std::vector<std::pair<armpl_int_t, T>> row_entries(len);
+ for (int j = 0; j < len; j++) {
+ row_entries[j] = {C_cols_[start + j], C_vals_[start + j]};
+ }
+
+ std::sort(row_entries.begin(), row_entries.end(),
+ [](const auto &a, const auto &b) { return a.first < b.first; });
+
+ for (int j = 0; j < len; j++) {
+ C_cols_[start + j] = row_entries[j].first;
+ C_vals_[start + j] = row_entries[j].second;
+ }
+ }
+ }
+ }
+
+ void postCallKernelCleanup() override {
+ status_ = armpl_spmat_destroy(A_armpl_);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_destroy(B_armpl_);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+ status_ = armpl_spmat_destroy(C_armpl_);
+ if (status_ != ARMPL_STATUS_SUCCESS) {
+ std::cout << "ERROR " << status_ << std::endl;
+ exit(1);
+ }
+
+ free(A_rows_);
+ free(A_cols_);
+ free(A_vals_);
+ free(B_rows_);
+ free(B_cols_);
+ free(B_vals_);
+ free(C_rows_);
+ free(C_cols_);
+ free(C_vals_);
+ }
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+
+
+ armpl_status_t status_;
+
+ armpl_int_t n_armpl_;
+ armpl_int_t m_armpl_;
+ armpl_int_t k_armpl_;
+ armpl_int_t nnzA_armpl_;
+ armpl_int_t nnzB_armpl_;
+ armpl_int_t nnzC_armpl_;
+
+ armpl_int_t* A_cols_;
+ armpl_int_t* A_rows_;
+ T* A_vals_;
+
+ armpl_int_t* B_rows_;
+ armpl_int_t* B_cols_;
+ T* B_vals_;
+
+ armpl_int_t* C_rows_;
+ armpl_int_t* C_cols_;
+  // C_vals_ is inherited from the parent class so that the result
+  // check can be carried out on the exported values.
+
+ armpl_spmat_t A_armpl_;
+ armpl_spmat_t B_armpl_;
+ armpl_spmat_t C_armpl_;
+
+};
+} // namespace cpu
+#endif
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 5dd2fc5..d4dbcbe 100644
--- a/Makefile
+++ b/Makefile
@@ -51,10 +51,10 @@ CXX = $(CXX_$(COMPILER))
CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
-CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
-CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare
+CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native
+CXXFLAGS_INTEL = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native -Wno-tautological-constant-compare
CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native
-CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native
+CXXFLAGS_HIP = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native
ifndef CXXFLAGS
CXXFLAGS = $(CXXFLAGS_$(COMPILER))
@@ -98,16 +98,16 @@ $(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Libra
endif
# Add INTEL compiler options
ifeq ($(COMPILER), INTEL)
-override CXXFLAGS += -L$(MKLROOT)/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int
+override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int
# Add GNU compiler options
else ifeq ($(COMPILER), GNU)
-override CXXFLAGS += -m64 -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int
+override CXXFLAGS += -m64 -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int
$(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):)
$(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`)
$(info )
# Add CLANG compiler options
else ifeq ($(COMPILER), CLANG)
-override CXXFLAGS += -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int
+override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int
$(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):)
$(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`)
$(info )
@@ -118,10 +118,11 @@ endif
HEADER_FILES+= $(wildcard oneMKL/CPU/*.hh)
else ifeq ($(CPU_LIB), AOCL)
+override CXXFLAGS += -laoclutils -lblis -lflame -laoclsparse
ifeq ($(COMPILER), INTEL)
-override CXXFLAGS += -lblis-mt -qopenmp
+override CXXFLAGS += -qopenmp
else
-override CXXFLAGS += -lblis-mt -fopenmp
+override CXXFLAGS += -fopenmp
endif
$(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):)
$(info $(TAB)$(TAB)Add `CXXFLAGS="-L/lib -I/include/blis -Wl,-rpath,/lib"` to make command)
@@ -170,14 +171,14 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be
else ifeq ($(GPU_LIB), CUBLAS)
# Do cuBLAS stuff
ifeq ($(COMPILER), NVIDIA)
-override CXXFLAGS += -cudalib=cublas
+override CXXFLAGS += -cudalib=cublas -lcusparse_static
else
$(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L/.../cuda/lib64` to make command)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-I/.../math_libs/include -I/.../cuda/include` to make command)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/.../math_libs/lib64 -Wl,-rpath,/.../cuda/lib64` to make command)
$(info )
-override CXXFLAGS += -lcublas -lcudart
+override CXXFLAGS += -lcublas -lcudart -lcusparse
endif
HEADER_FILES += $(wildcard cuBLAS/*.hh)
@@ -188,7 +189,7 @@ ifndef MKLROOT
$(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Library)
endif
# Add compiler and link options
-override CXXFLAGS += -fsycl -L$(MKLROOT)/lib -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_ILP64 -I"$(MKLROOT)/include"
+override CXXFLAGS += -fsycl -L$(MKLROOT)/lib/intel64 -lmkl_sycl_blas -lmkl_sycl_sparse -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_LP64 -I"$(MKLROOT)/include"
# `lmkl_tbb_thread` can replace `lmkl_sequential`
$(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):)
$(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`)
@@ -199,17 +200,17 @@ $(error Selected compiler $(COMPILER) is not currently compatible with oneMKL GP
endif
else ifeq ($(GPU_LIB), ROCBLAS)
-ifeq ($(COMPILER), HIP)
+# ifeq ($(COMPILER), HIP)
# Do rocBLAS stuff
-override CXXFLAGS += -lrocblas -lm -lpthread -D__HIP_PLATFORM_AMD__
+override CXXFLAGS += -lrocblas -lrocsparse -lm -lpthread -D__HIP_PLATFORM_AMD__
$(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-L/lib -L/lib` to make command)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-I/include -I/include` to make command)
$(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/lib -Wl,-rpath,/lib` to make command)
HEADER_FILES += $(wildcard rocBLAS/*.hh)
-else
-$(error Selected compiler $(COMPILER) is not currently compatible with rocBLAS GPU Library)
-endif
+# else
+# $(error Selected compiler $(COMPILER) is not currently compatible with rocBLAS GPU Library)
+# endif
else
@@ -225,7 +226,7 @@ ifdef GPU_LIB
override CXXFLAGS += -DGPU_$(GPU_LIB)
endif
-LDFLAGS = -lm
+LDFLAGS = -lm
# -------
@@ -233,11 +234,28 @@ EXE = gpu-blob
.PHONY: all $(EXE) clean
-all: $(EXE)
+all: print $(EXE)
+
+print:
+ @echo "COMPILER = $(COMPILER)"
+ @echo "CXX = $(CXX)"
+ @echo "CPU_LIB = $(CPU_LIB)"
+ @echo "GPU_LIB = $(GPU_LIB)"
+ @echo "CXXFLAGS = $(CXXFLAGS)"
+ @echo "LDFLAGS = $(LDFLAGS)"
+ @echo "Full command would be:"
+ @echo "$(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o gpu-blob"
+	@echo "░░ ░░░ ░░░ ░░░░ ░░░░░░░░ ░░░ ░░░░░░░░░ ░░░ ░░"
+	@echo "▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒"
+	@echo "▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓ ▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓ ▓▓"
+ @echo "█ ████ ██ ████████ ████ ████████ ████ ██ ████████ ████ ██ ████ █"
+ @echo "██ ███ █████████ █████████ ███ ███ ███ ██"
+
$(EXE): src/Consume/consume.c $(SRC_FILES) $(HEADER_FILES)
gcc src/Consume/consume.c -fpic -O0 -shared -o src/Consume/libconsume.so
- $(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o $@
+ @echo "Building main executable with $(CXX)"
+ $(CXX) $(SRC_FILES) -o $@ $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS)
clean:
- rm -f $(EXE) src/Consume/libconsume.so
\ No newline at end of file
+ rm -f $(EXE) src/Consume/libconsume.so
diff --git a/NVPL/spmdnm.hh b/NVPL/spmdnm.hh
new file mode 100644
index 0000000..1dc2bc0
--- /dev/null
+++ b/NVPL/spmdnm.hh
@@ -0,0 +1,43 @@
+#pragma once
+
+#ifdef CPU_NVPL
+
+#include "../include/kernels/CPU/spmdnm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spmdnm_cpu : public spmdnm<T> {
+public:
+  using spmdnm<T>::spmdnm;
+  using spmdnm<T>::callConsume;
+  using spmdnm<T>::initInputMatrices;
+  using spmdnm<T>::m_;
+  using spmdnm<T>::n_;
+  using spmdnm<T>::k_;
+  using spmdnm<T>::B_;
+  using spmdnm<T>::C_;
+  using spmdnm<T>::sparsity_;
+  using spmdnm<T>::type_;
+  using spmdnm<T>::nnz_;
+  using spmdnm<T>::iterations_;
+
+ void initialise(int m, int n, int k, double sparsity,
+ matrixType type, bool binary = false) {}
+
+protected:
+ void toSparseFormat() override {}
+
+private:
+ void preLoopRequirements() override {}
+
+ void callSpmdnm() override {}
+
+ void postLoopRequirements() override {}
+
+ void postCallKernelCleanup() override {}
+};
+}
+
+
+#endif
diff --git a/NVPL/spmdnv.hh b/NVPL/spmdnv.hh
new file mode 100644
index 0000000..e3f4353
--- /dev/null
+++ b/NVPL/spmdnv.hh
@@ -0,0 +1,213 @@
+#pragma once
+
+#ifdef CPU_NVPL
+#include <nvpl_sparse.h>  // assumed NVPL Sparse header
+
+#include "../include/kernels/CPU/spmdnv.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+/** A class for SpMDnV CPU BLAS kernels. */
+template <typename T>
+class spmdnv_cpu : public spmdnv<T> {
+ public:
+  using spmdnv<T>::spmdnv;
+  using spmdnv<T>::callConsume;
+  using spmdnv<T>::initInputMatrixVector;
+  using spmdnv<T>::m_;
+  using spmdnv<T>::n_;
+  using spmdnv<T>::x_;
+  using spmdnv<T>::y_;
+  using spmdnv<T>::sparsity_;
+  using spmdnv<T>::type_;
+  using spmdnv<T>::nnz_;
+  using spmdnv<T>::iterations_;
+
+ void initialise (int m, int n, double sparsity, matrixType type,
+ bool binary = false) {
+ m_ = m;
+ n_ = n;
+ sparsity_ = sparsity;
+ type_ = type;
+
+ nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_));
+
+    if constexpr (std::is_same_v<T, float>) {
+      dataType_ = NVPL_SPARSE_R_32F;
+    } else if constexpr (std::is_same_v<T, double>) {
+      dataType_ = NVPL_SPARSE_R_64F;
+ } else {
+ throw std::runtime_error("Only float and double are supported for NVPL.");
+ }
+
+ x_ = (T*)calloc(n_, sizeof(T));
+ y_ = (T*)calloc(m_, sizeof(T));
+ z_ = (T*)calloc(m_, sizeof(T));
+
+ initInputMatrixVector();
+ }
+
+protected:
+ void toSparseFormat() override {
+ A_vals_ = (T*)calloc(nnz_, sizeof(T));
+ A_cols_ = (int64_t*)calloc(nnz_, sizeof(int64_t));
+ A_rows_ = (int64_t*)calloc(m_ + 1, sizeof(int64_t));
+
+ // Fill the CSR arrays
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_);
+ } else {
+ std::cerr << "Matrix type not supported" << std::endl;
+ exit(1);
+ }
+
+ // Make the NVPL descriptors
+ status_ = nvpl_sparse_create_const_csr(&A_descr_,
+ m_,
+ n_,
+ nnz_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ indexType_,
+ indexType_,
+ base_,
+ dataType_);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+      std::cerr << "nvpl_sparse_create_const_csr failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+
+    status_ = nvpl_sparse_create_const_dn_vec(&X_descr_,
+ n_,
+ x_,
+ dataType_);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+ std::cerr << "nvpl_sparse_create_const_dn_vec failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+
+    status_ = nvpl_sparse_create_dn_vec(&Y_descr_,
+ m_,
+ y_,
+ dataType_);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+ std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+    status_ = nvpl_sparse_create_dn_vec(&Z_descr_,
+ m_,
+ z_,
+ dataType_);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+ std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+ }
+
+private:
+ void preLoopRequirements() override {}
+
+ void callSpMDnV() override {
+ size_t bufferSize;
+ status_ = nvpl_sparse_spmv_buffer_size(handle_,
+ operation_,
+ &alpha,
+ A_descr_,
+ X_descr_,
+ &beta,
+ Z_descr_,
+ Y_descr_,
+ dataType_,
+ algorithm_,
+ description_,
+ &bufferSize);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+      std::cerr << "nvpl_sparse_spmv_buffer_size failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+
+ void* externalBuffer = malloc(bufferSize);
+ status_ = nvpl_sparse_spmv_analysis(handle_,
+ operation_,
+ &alpha,
+ A_descr_,
+ X_descr_,
+ &beta,
+ Z_descr_,
+ Y_descr_,
+ dataType_,
+ algorithm_,
+ description_,
+ externalBuffer);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+ std::cerr << "nvpl_sparse_spmv_analysis failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+
+    status_ = nvpl_sparse_spmv(handle_,
+                               operation_,
+                               &alpha,
+                               A_descr_,
+                               X_descr_,
+                               &beta,
+                               Z_descr_,
+                               Y_descr_,
+                               dataType_,
+                               algorithm_,
+                               description_);
+ if (status_ != NVPL_SPARSE_STATUS_SUCCESS) {
+ std::cerr << "nvpl_sparse_spmv failed with error: " << status_ << std::endl;
+ exit(1);
+ }
+
+ free(externalBuffer);
+ }
+
+ void postLoopRequirements() override {}
+
+ void postCallKernelCleanup() override {
+ free(x_);
+ free(y_);
+ free(z_);
+ free(A_rows_);
+ free(A_cols_);
+ free(A_vals_);
+ }
+
+ nvpl_sparse_status_t status_;
+ nvpl_sparse_handle_t handle_;
+  nvpl_sparse_spmv_descr_t description_;
+
+ nvpl_sparse_spmv_alg_t algorithm_ = NVPL_SPARSE_SPMV_CSR_ALG1;
+ nvpl_sparse_operation_t operation_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE;
+ nvpl_sparse_data_type_t dataType_;
+ nvpl_sparse_index_type_t indexType_ = NVPL_SPARSE_INDEX_64I;
+ nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO;
+
+  // Note on naming: for consistency with the other libraries, which don't
+  // have a separate addition vector, Y is kept here as the output vector,
+  // even though the NVPL documentation uses Y for the addition vector and
+  // Z for the output vector.
+ nvpl_sparse_const_sp_mat_descr_t A_descr_;
+ nvpl_sparse_const_dn_vec_descr_t X_descr_;
+ nvpl_sparse_dn_vec_descr_t Z_descr_;
+ nvpl_sparse_dn_vec_descr_t Y_descr_;
+
+  // CSR arrays for matrix A and the unused addition vector Z
+  T* A_vals_;
+  int64_t* A_cols_;
+  int64_t* A_rows_;
+  T* z_;
+
+
+ const T alpha = ALPHA;
+ const T beta = BETA;
+};
+} // namespace cpu
+#endif
\ No newline at end of file
diff --git a/NVPL/spmspm.hh b/NVPL/spmspm.hh
new file mode 100644
index 0000000..ac63086
--- /dev/null
+++ b/NVPL/spmspm.hh
@@ -0,0 +1,30 @@
+#pragma once
+
+#ifdef CPU_NVPL
+
+#include "../include/kernels/CPU/spmspm.hh"
+#include "../include/utilities.hh"
+
+namespace cpu {
+template <typename T>
+class spmspm_cpu : public spmspm<T> {
+public:
+
+ void initialise(int m, int n, int k, double sparsity,
+ matrixType type, bool binary = false) {}
+
+protected:
+ void toSparseFormat() override {}
+
+private:
+ void preLoopRequirements() override {}
+
+ void callSpmspm() override {}
+
+ void postLoopRequirements() override {}
+
+ void postCallKernelCleanup() override {}
+};
+}
+
+#endif
diff --git a/README.md b/README.md
index d6f4161..1e6cd5e 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,10 @@ Only when an error occurs will any checksum be displayed to the user.
GFLOP/s are calculated using the following Total FLOPs formulas. The compute time excludes any initialisation, but does include any data movement / prefetching to/from the GPU device:
- **GEMM** : `FLOPs = (2 * M * N * K) + (b * M * N)` where `b` is `1` if BETA=0 and `3` if BETA=/=0
+ - **SPMDNM** : `FLOPs = (2 * N * NNZ)` where NNZ is the number of non-zero values in matrix A
+ - **SPMSPM** : `FLOPs = (NNZA * NNZB) / K` where NNZA and NNZB are the number of non-zero values in matrix A and matrix B respectively. This is the expected FLOP count, assuming the non-zero values are uniformly distributed across the columns of matrix A and the rows of matrix B
- **GEMV** : `FLOPs = (2 * M * N) + (b * M)` where `b` is `1` if BETA=0 and `3` if BETA=/=0
+ - **SPMDNV** : `FLOPs = (2 * NNZ)` where NNZ is the number of non-zero values in matrix A
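+
+For illustration only, the Total FLOPs formulas above can be written as a small Python helper (the function and argument names here are invented for this sketch and are not part of the benchmark):
+
+```python
+def total_flops(kernel, M, N, K=None, beta=0.0, nnz=None, nnz_a=None, nnz_b=None):
+    """Total FLOPs value used in the GFLOP/s calculation, per the formulas above."""
+    b = 1 if beta == 0 else 3
+    if kernel == "gemm":
+        return (2 * M * N * K) + (b * M * N)
+    if kernel == "spmdnm":
+        return 2 * N * nnz
+    if kernel == "spmspm":
+        # Expected count, assuming uniformly distributed non-zeros
+        return (nnz_a * nnz_b) / K
+    if kernel == "gemv":
+        return (2 * M * N) + (b * M)
+    if kernel == "spmdnv":
+        return 2 * nnz
+    raise ValueError("unknown kernel: " + kernel)
+
+# Example: a 1000x1000x1000 GEMM with BETA != 0 gives
+# (2 * 1000**3) + (3 * 1000**2) = 2,003,000,000 FLOPs.
+```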
# Build Options
Select the compiler you wish to use. Regardless of choice, `gcc` is required in order to build the `Consume.so` external library.
@@ -126,18 +129,22 @@ The kernels listed below are computed by the benchmark for a wide range of probl
- FP32, FP64
- Square, short-&-wide, tall-&-thin input sizes
-
+ - Square, short-&-wide, tall-&-thin input sizes
+
+ - SpMSpM
+ - FP32, FP64
+ - Square, short-&-wide, tall-&-thin input sizes
### Level 2 BLAS
- GEMV
- FP32, FP64
- Square, short-&-wide, tall-&-thin input sizes
-
+ - Square, short-&-wide, tall-&-thin input sizes
# Auxiliary Files
Additional to the main benchmark, there are two auxiliary python scripts which perform the following:
@@ -146,7 +153,6 @@ Additional to the main benchmark, there are two auxiliary python scripts which p
# Future Work
- - [ ] Add support for Sparce Kernels
- [ ] Add FP16/BF16 support for kernels
- [ ] Add batched GEMM functions
- [ ] Add support for Apple Accelerate
diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py
index 38c2646..43028c0 100644
--- a/calculateOffloadThreshold.py
+++ b/calculateOffloadThreshold.py
@@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload
gpuAlways.M = 0
gpuAlways.N = 0
gpuAlways.K = 0
- if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])):
+ if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])):
# Do check to see if this is a momentary drop that we should ignore
if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])):
gpuUnified.cpuGflops = 0.0
diff --git a/createFlopsPerSizeGraphs.py b/createFlopsPerSizeGraphs.py
new file mode 100644
index 0000000..1e50301
--- /dev/null
+++ b/createFlopsPerSizeGraphs.py
@@ -0,0 +1,988 @@
+import os
+import sys
+import matplotlib.pyplot as plt
+
+
+
+directory = "CSV_Results"
+# Get given CSV file directory
+if(len(sys.argv) > 1):
+ directory = sys.argv[1]
+
+outputDir = "Graphs_" + directory.replace('/', '_')
+
+# Check if CSV directory exists
+path = os.path.join(os.getcwd(), directory)
+if(not os.path.isdir(path)):
+ print("ERROR - {} directory does not exist. Cannot generate any graphs.".format(directory))
+ exit(1)
+
+# Get all filenames
+path = os.path.join(os.getcwd(), directory)
+filenames = os.listdir(path)
+
+# Make Graphs directory
+graphDir = os.path.join(os.getcwd(), outputDir)
+if(not os.path.isdir(graphDir)):
+ os.mkdir(graphDir)
+
+# ------------------------------ GEMV Graphs --------------------------------------------
+print("Creating GEMV graphs...")
+# Create GEMV graphs
+gemvFilenames = []
+for i in range(0, len(filenames)):
+ if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]:
+ gemvFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(gemvFilenames)):
+ mn = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+ prob_size = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, gemvFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MN
+ if (len(mn) == 0) or ([line[2], line[3]] not in mn):
+ mn.append([line[2], line[3]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ size = float(line[5].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ prob_size.append(size)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_vector_M=N" in gemvFilenames[i]:
+ x_name = "Value of M, N"
+ inputTypeStr = "Square x Vector (M=N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_tall-thin_vector_M=16N" in gemvFilenames[i]:
+ x_name = "Value of N where M=16N"
+ inputTypeStr = "Tall-Thin x Vector (M=16N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]:
+ x_name = "Value of M, where N=32"
+ inputTypeStr = "Tall-Thin x Vector (M, N=32)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_N=16M" in gemvFilenames[i]:
+ x_name = "Value of M, where N=16M"
+ inputTypeStr = "Short-Wide x Vector (N=16M)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_M=32_N" in gemvFilenames[i]:
+ x_name = "Value of N, where M=32"
+ inputTypeStr = "Short-Wide x Vector (M=32, N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sgemv" :
+ fp = "FP32"
+ elif kernel == "dgemv":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+ title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+ if len(prob_size) > 0:
+ ax2 = ax1.twinx()
+ ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)")
+ ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14)
+ ax2.tick_params(axis='y', labelcolor="red")
+ ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1)
+ lines_1, labels_1 = ax1.get_legend_handles_labels()
+ lines_2, labels_2 = ax2.get_legend_handles_labels()
+ ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ SpMDnV Graphs --------------------------------------------
+print("Creating SpMDnV graphs...")
+# Create SpMDnV graphs
+spmdnvFilenames = []
+for i in range(0, len(filenames)):
+ if "spmdnv_" in filenames[i]:
+ spmdnvFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(spmdnvFilenames)):
+ mn = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+ prob_size = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MN
+ if (len(mn) == 0) or ([line[2], line[3]] not in mn):
+ mn.append([line[2], line[3]]) # line[2] = M, line[3] = N
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ size = float(line[5].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ prob_size.append(size)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_vector_M=N" in spmdnvFilenames[i]:
+ x_name = "Value of M, N"
+ inputTypeStr = "Square x Vector (M=N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]:
+ x_name = "Value of N where M=16N"
+ inputTypeStr = "Tall-Thin x Vector (M=16N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]:
+ x_name = "Value of M, where N=32"
+ inputTypeStr = "Tall-Thin x Vector (M, N=32)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]:
+ x_name = "Value of M, where N=16M"
+ inputTypeStr = "Short-Wide x Vector (N=16M)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]:
+ x_name = "Value of N, where M=32"
+ inputTypeStr = "Short-Wide x Vector (M=32, N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sspmdnv" :
+ fp = "FP32"
+ elif kernel == "dspmdnv":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+ title = "{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+ if len(prob_size) > 0:
+ ax2 = ax1.twinx()
+ ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)")
+ ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14)
+ ax2.tick_params(axis='y', labelcolor="red")
+ ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1)
+ lines_1, labels_1 = ax1.get_legend_handles_labels()
+ lines_2, labels_2 = ax2.get_legend_handles_labels()
+ ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ GEMM Graphs --------------------------------------------
+print("Creating GEMM graphs...")
+# Create GEMM graphs
+gemmFilenames = []
+for i in range(0, len(filenames)):
+ if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]:
+ gemmFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(gemmFilenames)):
+ mnk = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+ prob_size = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, gemmFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ sparsity = float(line1[6])
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MNK
+ if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk):
+ mnk.append([line[2], line[3], line[4]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ size = float(line[5].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ prob_size.append(size)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_square_M=N=K" in gemmFilenames[i]:
+ x_name = "Value of M, N, K"
+ inputTypeStr = "Square x Square (M=N=K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_tall-thin_short-wide_M=N_M=16K" in gemmFilenames[i]:
+ x_name = "Value of K where M=16K and N=16K"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_short-wide_M=N_K=32" in gemmFilenames[i]:
+ x_name = "Value of M and N, where K=32"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N_K=16M" in gemmFilenames[i]:
+ x_name = "Value of M and N, where K=16M"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N=32_K" in gemmFilenames[i]:
+ x_name = "Value of K, where M=32 and N=32"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N_M=16K" in gemmFilenames[i]:
+ x_name = "Value of N and K, where M=16K"
+ inputTypeStr = "Tall-Thin x Square (N=K, M=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N=32_M" in gemmFilenames[i]:
+ x_name = "Value of M, where N=32 and K=32"
+ inputTypeStr = "Tall-Thin x Square (M, N=K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K_N=16K" in gemmFilenames[i]:
+ x_name = "Value of M and K, where N=16K"
+ inputTypeStr = "Square x Short-Wide (M=K, N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K=32_N" in gemmFilenames[i]:
+ x_name = "Value of N, where M=32 and K=32"
+ inputTypeStr = "Square x Short-Wide (M=K=32, N)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sgemm" :
+ fp = "FP32"
+ elif kernel == "dgemm":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+  title = ("{}GEMM Performance for {} Problems (sparsity = {}) - {} "
+            "iterations per problem size").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+ if len(prob_size) > 0:
+ ax2 = ax1.twinx()
+ ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)")
+ ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14)
+ ax2.tick_params(axis='y', labelcolor="red")
+ ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1)
+ lines_1, labels_1 = ax1.get_legend_handles_labels()
+ lines_2, labels_2 = ax2.get_legend_handles_labels()
+ ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ SpMDnM Graphs --------------------------------------------
+print("Creating SpMDnM graphs...")
+# Create SpMDnM graphs
+spmdnmFilenames = []
+for i in range(0, len(filenames)):
+ if "spmdnm_" in filenames[i]:
+ spmdnmFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(spmdnmFilenames)):
+ mnk = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+ prob_size = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ sparsity = float(line1[6])
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MNK
+ if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk):
+ mnk.append([line[2], line[3], line[4]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ size = float(line[5].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ prob_size.append(size)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_square_M=N=K" in spmdnmFilenames[i]:
+ x_name = "Value of M, N, K"
+ inputTypeStr = "Square x Square (M=N=K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]:
+ x_name = "Value of K where M=16K and N=16K"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]:
+ x_name = "Value of M and N, where K=32"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]:
+ x_name = "Value of M and N, where K=16M"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]:
+ x_name = "Value of K, where M=32 and N=32"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]:
+ x_name = "Value of N and K, where M=16K"
+ inputTypeStr = "Tall-Thin x Square (N=K, M=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]:
+ x_name = "Value of M, where N=32 and K=32"
+ inputTypeStr = "Tall-Thin x Square (M, N=K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]:
+ x_name = "Value of M and K, where N=16K"
+ inputTypeStr = "Square x Short-Wide (M=K, N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]:
+ x_name = "Value of N, where M=32 and K=32"
+ inputTypeStr = "Square x Short-Wide (M=K=32, N)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sspmdnm" :
+ fp = "FP32"
+ elif kernel == "dspmdnm":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+  title = ("{}SpMDnM Performance for {} Problems (sparsity = {}) - {} "
+            "iterations per problem size").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+ if len(prob_size) > 0:
+ ax2 = ax1.twinx()
+ ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)")
+ ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14)
+ ax2.tick_params(axis='y', labelcolor="red")
+ ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1)
+ lines_1, labels_1 = ax1.get_legend_handles_labels()
+ lines_2, labels_2 = ax2.get_legend_handles_labels()
+ ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ SpMSpM Graphs --------------------------------------------
+print("Creating SpMSpM graphs...")
+# Create SpMSpM graphs
+spmspmFilenames = []
+for i in range(0, len(filenames)):
+ if "spmspm_" in filenames[i]:
+ spmspmFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(spmspmFilenames)):
+ mnk = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+ prob_size = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ sparsity = float(line1[6])
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MNK
+ if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk):
+ mnk.append([line[2], line[3], line[4]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ size = float(line[5].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ prob_size.append(size)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_square_M=N=K" in spmspmFilenames[i]:
+ x_name = "Value of M, N, K"
+ inputTypeStr = "Square x Square (M=N=K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]:
+ x_name = "Value of K where M=16K and N=16K"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]:
+ x_name = "Value of M and N, where K=32"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]:
+ x_name = "Value of M and N, where K=16M"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]:
+ x_name = "Value of K, where M=32 and N=32"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]:
+ x_name = "Value of N and K, where M=16K"
+ inputTypeStr = "Tall-Thin x Square (N=K, M=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]:
+ x_name = "Value of M, where N=32 and K=32"
+ inputTypeStr = "Tall-Thin x Square (M, N=K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]:
+ x_name = "Value of M and K, where N=16K"
+ inputTypeStr = "Square x Short-Wide (M=K, N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]:
+ x_name = "Value of N, where M=32 and K=32"
+ inputTypeStr = "Square x Short-Wide (M=K=32, N)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sspmspm" :
+ fp = "FP32"
+ elif kernel == "dspmspm":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+  title = ("{}SpMSpM Performance for {} Problems (sparsity = {}) - {} "
+            "iterations per problem size").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+ if len(prob_size) > 0:
+ ax2 = ax1.twinx()
+ ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)")
+ ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14)
+ ax2.tick_params(axis='y', labelcolor="red")
+ ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1)
+ lines_1, labels_1 = ax1.get_legend_handles_labels()
+ lines_2, labels_2 = ax2.get_legend_handles_labels()
+ ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left")
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py
index 0ed7772..8108812 100644
--- a/createGflopsGraphs.py
+++ b/createGflopsGraphs.py
@@ -26,12 +26,346 @@
if(not os.path.isdir(graphDir)):
os.mkdir(graphDir)
+# ------------------------------ GEMV Graphs --------------------------------------------
+print("Creating GEMV graphs...")
+# Create GEMV graphs
+gemvFilenames = []
+for i in range(0, len(filenames)):
+ if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]:
+ gemvFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(gemvFilenames)):
+ mn = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, gemvFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MN
+ if (len(mn) == 0) or ([line[2], line[3]] not in mn):
+ mn.append([line[2], line[3]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_vector_M=N" in gemvFilenames[i]:
+ x_name = "Value of M, N"
+ inputTypeStr = "Square x Vector (M=N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_tall-thin_vector_M=16N" in gemvFilenames[i]:
+ x_name = "Value of N where M=16N"
+ inputTypeStr = "Tall-Thin x Vector (M=16N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]:
+ x_name = "Value of M, where N=32"
+ inputTypeStr = "Tall-Thin x Vector (M, N=32)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_N=16M" in gemvFilenames[i]:
+ x_name = "Value of M, where N=16M"
+ inputTypeStr = "Short-Wide x Vector (N=16M)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_M=32_N" in gemvFilenames[i]:
+ x_name = "Value of N, where M=32"
+ inputTypeStr = "Short-Wide x Vector (M=32, N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sgemv" :
+ fp = "FP32"
+ elif kernel == "dgemv":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+ title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ SpMDnV Graphs --------------------------------------------
+print("Creating SpMDnV graphs...")
+# Create SpMDnV graphs
+spmdnvFilenames = []
+for i in range(0, len(filenames)):
+ if "spmdnv_" in filenames[i]:
+ spmdnvFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(spmdnvFilenames)):
+ mn = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MN
+ if (len(mn) == 0) or ([line[2], line[3]] not in mn):
+ mn.append([line[2], line[3]]) # line[2] = M, line[3] = N
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+
+
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_vector_M=N" in spmdnvFilenames[i]:
+ x_name = "Value of M, N"
+ inputTypeStr = "Square x Vector (M=N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]:
+ x_name = "Value of N where M=16N"
+ inputTypeStr = "Tall-Thin x Vector (M=16N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]:
+ x_name = "Value of M, where N=32"
+ inputTypeStr = "Tall-Thin x Vector (M, N=32)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]:
+ x_name = "Value of M, where N=16M"
+ inputTypeStr = "Short-Wide x Vector (N=16M)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][0])
+ elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]:
+ x_name = "Value of N, where M=32"
+ inputTypeStr = "Short-Wide x Vector (M=32, N)"
+ for j in range(0, len(mn)):
+ xVals.append(mn[j][1])
+ else:
+ # File not supported so go to next file
+ continue
+
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sspmdnv" :
+ fp = "FP32"
+ elif kernel == "dspmdnv":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+ title = "{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+
+ if(gpuEnabled):
+    yCoord = round(max(gpuO_Gflops + gpuA_Gflops + gpuU_Gflops), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
# ------------------------------ GEMM Graphs --------------------------------------------
print("Creating GEMM graphs...")
# Create GEMM graphs
gemmFilenames = []
for i in range(0, len(filenames)):
- if "gemm_" in filenames[i]:
+ if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]:
gemmFilenames.append(filenames[i])
### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
@@ -54,7 +388,8 @@
# Get number of iterations performed and kernel name
line1 = lines[0].split(',')
- iters = int(line1[6])
+ sparsity = float(line1[6])
+ iters = int(line1[7])
kernel = line1[1]
# Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
@@ -127,8 +462,6 @@
# File not supported so go to next file
continue
-
-
# Create y-axis label & graph title
y_name = ""
title = ""
@@ -138,7 +471,9 @@
elif kernel == "dgemm":
fp = "FP64"
y_name = "{} GFLOP/s".format(fp)
- title = "{}GEMM Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+ title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} "
+ "iterations per problemize").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
# Make Graph
fig1 = plt.figure(figsize=(28,16))
@@ -199,31 +534,32 @@
plt.margins(x=0.01, y=0.01)
leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
- for obj in leg.legendHandles:
+ for obj in leg.legend_handles:
obj.set_linewidth(3.0)
obj.set_markersize(15.0)
obj.set_markeredgewidth(3.0)
plt.xlabel(x_name, fontsize=20)
plt.ylabel(y_name, fontsize=20)
plt.title(title, fontsize=20)
- plt.savefig(fname="{}/{}.png".format(graphDir, gemmFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight")
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
plt.close('all')
+ print("\tPDF made")
print("Finished!")
# ---------------------------------------------------------------------------------------
-# ------------------------------ GEMV Graphs --------------------------------------------
-print("Creating GEMV graphs...")
-# Create GEMV graphs
-gemvFilenames = []
+# ------------------------------ SpMDnM Graphs --------------------------------------------
+print("Creating SpMDnM graphs...")
+# Create SpMDnM graphs
+spmdnmFilenames = []
for i in range(0, len(filenames)):
- if "gemv_" in filenames[i]:
- gemvFilenames.append(filenames[i])
+ if "spmdnm_" in filenames[i]:
+ spmdnmFilenames.append(filenames[i])
### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
-for i in range(0, len(gemvFilenames)):
- mn = []
+for i in range(0, len(spmdnmFilenames)):
+ mnk = []
iters = 0
kernel = ""
cpu_Gflops = []
@@ -232,7 +568,7 @@
gpuU_Gflops = []
# Open file and get all lines
- fName = os.path.join(os.getcwd(), directory, gemvFilenames[i])
+ fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i])
openFile = open(fName, 'r')
lines = openFile.readlines()
lines.pop(0) # Remove headers
@@ -241,15 +577,16 @@
# Get number of iterations performed and kernel name
line1 = lines[0].split(',')
- iters = int(line1[6])
+ sparsity = float(line1[6])
+ iters = int(line1[7])
kernel = line1[1]
- # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types
+ # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
for line in lines:
line = line.split(',')
- # Get MN
- if (len(mn) == 0) or ([line[2], line[3]] not in mn):
- mn.append([line[2], line[3]])
+ # Get MNK
+ if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk):
+ mnk.append([line[2], line[3], line[4]])
# Get Gflops
gflops = float(line[-1].rstrip())
if line[0] == "cpu":
@@ -261,52 +598,260 @@
elif line[0] == "gpu_unified":
gpuU_Gflops.append(gflops)
-
# Create x-axis label and tick values
inputTypeStr = ""
x_name = ""
xVals = []
- if "_square_vector_M=N" in gemvFilenames[i]:
- x_name = "Value of M, N"
- inputTypeStr = "Square x Vector (M=N)"
- for j in range(0, len(mn)):
- xVals.append(mn[j][0])
- elif "_tall-thin_vector_M=16N" in gemvFilenames[i]:
- x_name = "Value of N where M=16N"
- inputTypeStr = "Tall-Thin x Vector (M=16N)"
- for j in range(0, len(mn)):
- xVals.append(mn[j][1])
- elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]:
- x_name = "Value of M, where N=32"
- inputTypeStr = "Tall-Thin x Vector (M, N=32)"
- for j in range(0, len(mn)):
- xVals.append(mn[j][0])
- elif "_short-wide_vector_N=16M" in gemvFilenames[i]:
- x_name = "Value of M, where N=16M"
- inputTypeStr = "Short-Wide x Vector (N=16M)"
- for j in range(0, len(mn)):
- xVals.append(mn[j][0])
- elif "_short-wide_vector_M=32_N" in gemvFilenames[i]:
- x_name = "Value of N, where M=32"
- inputTypeStr = "Short-Wide x Vector (M=32, N)"
- for j in range(0, len(mn)):
- xVals.append(mn[j][1])
+ if "_square_square_M=N=K" in spmdnmFilenames[i]:
+ x_name = "Value of M, N, K"
+ inputTypeStr = "Square x Square (M=N=K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]:
+ x_name = "Value of K where M=16K and N=16K"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]:
+ x_name = "Value of M and N, where K=32"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]:
+ x_name = "Value of M and N, where K=16M"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]:
+ x_name = "Value of K, where M=32 and N=32"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]:
+ x_name = "Value of N and K, where M=16K"
+ inputTypeStr = "Tall-Thin x Square (N=K, M=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]:
+ x_name = "Value of M, where N=32 and K=32"
+ inputTypeStr = "Tall-Thin x Square (M, N=K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]:
+ x_name = "Value of M and K, where N=16K"
+ inputTypeStr = "Square x Short-Wide (M=K, N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]:
+ x_name = "Value of N, where M=32 and K=32"
+ inputTypeStr = "Square x Short-Wide (M=K=32, N)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][1])
else:
# File not supported so go to next file
continue
+ # Create y-axis label & graph title
+ y_name = ""
+ title = ""
+ fp = ""
+ if kernel == "sspmdnm" :
+ fp = "FP32"
+ elif kernel == "dspmdnm":
+ fp = "FP64"
+ y_name = "{} GFLOP/s".format(fp)
+ title = ("{}SpMDnM Performance for {} Problems (sparsity = {})- {} "
+ "iterations per problemize").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
+
+ # Make Graph
+ fig1 = plt.figure(figsize=(28,16))
+ ax1 = fig1.add_subplot()
+
+ gpuEnabled = False
+ if len(cpu_Gflops) > 0:
+ ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU")
+ # Plot line at max GFLOP/s
+ yCoord = round(max(cpu_Gflops),1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+ if len(gpuO_Gflops) > 0:
+ ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)")
+ gpuEnabled = True
+ if len(gpuA_Gflops) > 0:
+ ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)")
+ gpuEnabled = True
+ if len(gpuU_Gflops) > 0:
+ ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)")
+ gpuEnabled = True
+
+ if gpuEnabled:
+ gpuMaxes = [max(g) for g in [gpuO_Gflops, gpuA_Gflops, gpuU_Gflops] if len(g) > 0]
+ yCoord = round(max(gpuMaxes), 1)
+ ax1.axhline(yCoord, color='black', linestyle='--')
+ ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom')
+
+ # Set X ticks
+ NUM_TICK = 8
+ numXVals = len(xVals)
+ if numXVals < NUM_TICK:
+ # Print all labels
+ plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20)
+ else:
+ # Calculate labels
+ locInterval = int((numXVals) / (NUM_TICK-1))
+ tickLocs = [0]
+ for q in range(1, (NUM_TICK-1)):
+ tickLocs.append(1 + (locInterval * q))
+ tickLocs.append(numXVals - 1)
+
+ labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1))
+ tickLabs = [xVals[0]]
+ for q in range(1, (NUM_TICK-1)):
+ tickLabs.append(int(xVals[0]) + (labelInterval * q))
+ tickLabs.append(int(xVals[-1]))
+
+ plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20)
+
+ # Force setting of y-axis labels. If this isn't done then the range is weird...
+ yLoc, yLab = plt.yticks()
+ yLoc = yLoc.tolist()
+ # Remove negative first element of the list
+ if yLoc[0] != 0:
+ yLoc = yLoc[1:]
+ plt.ylim(0, yLoc[-1])
+ plt.yticks(ticks=yLoc, fontsize=20)
+
+ plt.margins(x=0.01, y=0.01)
+ leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
+ for obj in leg.legend_handles:
+ obj.set_linewidth(3.0)
+ obj.set_markersize(15.0)
+ obj.set_markeredgewidth(3.0)
+ plt.xlabel(x_name, fontsize=20)
+ plt.ylabel(y_name, fontsize=20)
+ plt.title(title, fontsize=20)
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
+ plt.close('all')
+ print("\tPDF made")
+
+
+print("Finished!")
+# ---------------------------------------------------------------------------------------
+
+# ------------------------------ SpMSpM Graphs --------------------------------------------
+print("Creating SpMSpM graphs...")
+# Create SpMSpM graphs
+spmspmFilenames = []
+for i in range(0, len(filenames)):
+ if "spmspm_" in filenames[i]:
+ spmspmFilenames.append(filenames[i])
+
+### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s
+for i in range(0, len(spmspmFilenames)):
+ mnk = []
+ iters = 0
+ kernel = ""
+ cpu_Gflops = []
+ gpuO_Gflops = []
+ gpuA_Gflops = []
+ gpuU_Gflops = []
+
+ # Open file and get all lines
+ fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i])
+ openFile = open(fName, 'r')
+ lines = openFile.readlines()
+ lines.pop(0) # Remove headers
+ if len(lines) == 0 :
+ continue
+
+ # Get number of iterations performed and kernel name
+ line1 = lines[0].split(',')
+ sparsity = float(line1[6])
+ iters = int(line1[7])
+ kernel = line1[1]
+
+ # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types
+ for line in lines:
+ line = line.split(',')
+ # Get MNK
+ if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk):
+ mnk.append([line[2], line[3], line[4]])
+ # Get Gflops
+ gflops = float(line[-1].rstrip())
+ if line[0] == "cpu":
+ cpu_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadOnce":
+ gpuO_Gflops.append(gflops)
+ elif line[0] == "gpu_offloadAlways":
+ gpuA_Gflops.append(gflops)
+ elif line[0] == "gpu_unified":
+ gpuU_Gflops.append(gflops)
+ # Create x-axis label and tick values
+ inputTypeStr = ""
+ x_name = ""
+ xVals = []
+ if "_square_square_M=N=K" in spmspmFilenames[i]:
+ x_name = "Value of M, N, K"
+ inputTypeStr = "Square x Square (M=N=K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]:
+ x_name = "Value of K where M=16K and N=16K"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]:
+ x_name = "Value of M and N, where K=32"
+ inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]:
+ x_name = "Value of M and N, where K=16M"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]:
+ x_name = "Value of K, where M=32 and N=32"
+ inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]:
+ x_name = "Value of N and K, where M=16K"
+ inputTypeStr = "Tall-Thin x Square (N=K, M=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][2])
+ elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]:
+ x_name = "Value of M, where N=32 and K=32"
+ inputTypeStr = "Tall-Thin x Square (M, N=K=32)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]:
+ x_name = "Value of M and K, where N=16K"
+ inputTypeStr = "Square x Short-Wide (M=K, N=16K)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][0])
+ elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]:
+ x_name = "Value of N, where M=32 and K=32"
+ inputTypeStr = "Square x Short-Wide (M=K=32, N)"
+ for j in range(0, len(mnk)):
+ xVals.append(mnk[j][1])
+ else:
+ # File not supported so go to next file
+ continue
# Create y-axis label & graph title
y_name = ""
title = ""
fp = ""
- if kernel == "sgemv" :
+ if kernel == "sspmspm" :
fp = "FP32"
- elif kernel == "dgemv":
+ elif kernel == "dspmspm":
fp = "FP64"
y_name = "{} GFLOP/s".format(fp)
- title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters)
+ title = ("{}SpMSpM Performance for {} Problems (sparsity = {})- {} "
+ "iterations per problemize").format(kernel[0].upper(),
+ inputTypeStr, sparsity, iters)
# Make Graph
fig1 = plt.figure(figsize=(28,16))
@@ -367,16 +912,17 @@
plt.margins(x=0.01, y=0.01)
leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18)
- for obj in leg.legendHandles:
+ for obj in leg.legend_handles:
obj.set_linewidth(3.0)
obj.set_markersize(15.0)
obj.set_markeredgewidth(3.0)
plt.xlabel(x_name, fontsize=20)
plt.ylabel(y_name, fontsize=20)
plt.title(title, fontsize=20)
- plt.savefig(fname="{}/{}.png".format(graphDir, gemvFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight")
+ plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight")
plt.close('all')
+ print("\tPDF made")
print("Finished!")
-# ---------------------------------------------------------------------------------------
\ No newline at end of file
+# ---------------------------------------------------------------------------------------
diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh
index 78d0270..af222fb 100644
--- a/cuBLAS/common.hh
+++ b/cuBLAS/common.hh
@@ -2,24 +2,45 @@
#if defined GPU_CUBLAS
+#include <cublas_v2.h>
+#include <cusparse.h>
+#include <iostream>
+
/** Macro function to check if error occurred when calling CUDA. */
-#define cudaCheckError(f) \
- do { \
- if (cudaError_t e = (f); e != cudaSuccess) { \
- std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": " \
- << cudaGetErrorString(e) << std::endl; \
- exit(1); \
- } \
+#define cudaCheckError(f) \
+ do { \
+ if (cudaError_t e = (f); e != cudaSuccess) { \
+ std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": "; \
+ std::cout << cudaGetErrorString(e) << std::endl; \
+ exit(1); \
+ } \
} while (false)
/** Macro function to check if error occurred when calling cuBLAS. */
-#define cublasCheckError(f) \
- do { \
- if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \
- std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \
- << cublasGetStatusString(e) << std::endl; \
- exit(1); \
- } \
+#define cublasCheckError(f) \
+ do { \
+ cublasStatus_t status = (f); \
+ if (status != CUBLAS_STATUS_SUCCESS) { \
+ std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": "; \
+ std::cout << cublasGetStatusName(status) << " - "; \
+ std::cout << cublasGetStatusString(status) << std::endl; \
+ exit(1); \
+ } \
} while (false)
-#endif
\ No newline at end of file
+/** Macro function to check if error occurred when calling cuSPARSE. */
+#define cusparseCheckError(f) \
+ do { \
+ cusparseStatus_t status = (f); \
+ if (status != CUSPARSE_STATUS_SUCCESS) { \
+ std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": "; \
+ std::cout << cusparseGetErrorName(status) << " - "; \
+ std::cout << cusparseGetErrorString(status) << std::endl; \
+ exit(1); \
+ } \
+ } while (false)
+
+#endif
+
+
diff --git a/cuBLAS/spmdnm.hh b/cuBLAS/spmdnm.hh
new file mode 100644
index 0000000..bb08d90
--- /dev/null
+++ b/cuBLAS/spmdnm.hh
@@ -0,0 +1,552 @@
+#pragma once
+
+#ifdef GPU_CUBLAS
+#include <cuda_runtime.h>
+#include <cusparse.h>
+#include <type_traits>
+#include <cstring>
+#include <iostream>
+
+#include "../include/kernels/GPU/spmdnm.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+
+namespace gpu {
+/** A class for sparse matrix-dense matrix multiplication (SpMDnM) GPU BLAS kernels. */
+template <typename T>
+class spmdnm_gpu : public spmdnm<T> {
+public:
+ using spmdnm<T>::spmdnm;
+ using spmdnm<T>::initInputMatrices;
+ using spmdnm<T>::m_;
+ using spmdnm<T>::n_;
+ using spmdnm<T>::k_;
+ using spmdnm<T>::B_;
+ using spmdnm<T>::C_;
+ using spmdnm<T>::offload_;
+ using spmdnm<T>::nnz_;
+ using spmdnm<T>::sparsity_;
+ using spmdnm<T>::type_;
+
+ ~spmdnm_gpu() {
+ if (alreadyInitialised_) {
+ cusparseCheckError(cusparseDestroy(handle_));
+
+ cudaCheckError(cudaStreamDestroy(stream1_));
+ cudaCheckError(cudaStreamDestroy(stream2_));
+ cudaCheckError(cudaStreamDestroy(stream3_));
+ cudaCheckError(cudaStreamDestroy(stream4_));
+ cudaCheckError(cudaStreamDestroy(stream5_));
+
+ alreadyInitialised_ = false;
+ }
+ }
+
+ void initialise(gpuOffloadType offload, int m, int n, int k,
+ double sparsity, matrixType type, bool binary = false) override {
+ if (!alreadyInitialised_) {
+ alreadyInitialised_ = true;
+ cusparseCheckError(cusparseCreate(&handle_));
+
+ cudaCheckError(cudaStreamCreate(&stream1_));
+ cudaCheckError(cudaStreamCreate(&stream2_));
+ cudaCheckError(cudaStreamCreate(&stream3_));
+ cudaCheckError(cudaStreamCreate(&stream4_));
+ cudaCheckError(cudaStreamCreate(&stream5_));
+
+ cusparseCheckError(cusparseSetStream(handle_, stream1_));
+
+ // Get device identifier
+ cudaCheckError(cudaGetDevice(&gpuDevice_));
+
+ }
+ offload_ = offload;
+ sparsity_ = sparsity;
+ type_ = type;
+
+ m_ = m;
+ n_ = n;
+ k_ = k;
+
+ B_ = C_ = B_dev_ = C_dev_ = A_vals_ = A_vals_dev_ = nullptr;
+ A_rows_ = A_cols_ = A_rows_dev_ = A_cols_dev_ = nullptr;
+ /** Determine the number of nnz elements in A and B */
+ nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_));
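+ // Add one so matrix A always holds at least one non-zero, even as sparsity approaches 1.0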
+
+ // Set up cuSPARSE metadata
+ opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+ opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE;
+ alg_ = CUSPARSE_SPMM_ALG_DEFAULT;
+ index_ = CUSPARSE_INDEX_64I;
+ base_ = CUSPARSE_INDEX_BASE_ZERO;
+ B_order_ = CUSPARSE_ORDER_ROW;
+ C_order_ = CUSPARSE_ORDER_ROW;
+ if (std::is_same_v<T, float>) {
+ dataType_ = CUDA_R_32F;
+ } else if (std::is_same_v<T, double>) {
+ dataType_ = CUDA_R_64F;
+ } else {
+ std::cerr << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl;
+ exit(1);
+ }
+
+ if (offload_ == gpuOffloadType::unified) {
+ cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_));
+ cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_));
+ } else {
+ B_ = (T*)malloc(sizeof(T) * k_ * n_);
+ C_ = (T*)malloc(sizeof(T) * m_ * n_);
+
+ cudaCheckError(cudaMalloc((void**)&B_dev_, sizeof(T) * k_ * n_));
+ cudaCheckError(cudaMalloc((void**)&C_dev_, sizeof(T) * m_ * n_));
+ }
+ cudaCheckError(cudaDeviceSynchronize());
+
+ initInputMatrices();
+ }
+
+protected:
+ void toSparseFormat() override {
+ if (offload_ == gpuOffloadType::always) {
+ A_vals_store_ = (T*)malloc(sizeof(T) * nnz_);
+ A_cols_store_ = (int64_t*)malloc(sizeof(int64_t) * nnz_);
+ A_rows_store_ = (int64_t*)malloc(sizeof(int64_t) * (m_ + 1));
+
+ if (type_ == matrixType::rmat) {
+ rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_);
+ } else if (type_ == matrixType::random) {
+ randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_);
+ } else if (type_ == matrixType::finiteElements) {
+ finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_);
+ } else {
+ exit(1);
+ }
+ }
+
+ // Allocate CSR arrays
+ if (offload_ == gpuOffloadType::unified) {
+ cudaCheckError(cudaMallocManaged(&A_vals_, nnz_ * sizeof(T)));
+ cudaCheckError(cudaMallocManaged(&A_cols_, nnz_ * sizeof(int64_t)));
+ cudaCheckError(cudaMallocManaged(&A_rows_, (m_ + 1) * sizeof(int64_t)));
+ } else {
+ A_vals_ = (T*)malloc(nnz_ * sizeof(T));
+ A_cols_ = (int64_t*)malloc(nnz_ * sizeof(int64_t));
+ A_rows_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t));
+ cudaCheckError(cudaMalloc((void**)&A_vals_dev_, nnz_ * sizeof(T)));
+ cudaCheckError(cudaMalloc((void**)&A_cols_dev_, nnz_ * sizeof(int64_t)));
+ cudaCheckError(cudaMalloc((void**)&A_rows_dev_, (m_ + 1) * sizeof(int64_t)));
+ }
+ cudaCheckError(cudaDeviceSynchronize());
+
+ memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_);
+ memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_);
+ memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1));
+ cudaCheckError(cudaDeviceSynchronize());
+ }
+
+private:
+ void preLoopRequirements() override {
+ switch(offload_) {
+ case gpuOffloadType::always: {
+ break;
+ }
+ case gpuOffloadType::once: {
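+ // Copy the CSR arrays and the dense B and C matrices to the device once, ahead of the timed loop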
+ cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_));
+ cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_));
+ cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_));
+ cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_));
+ cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_));
+ break;
+ }
+ case gpuOffloadType::unified: {
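+ // Prefetch the managed allocations to the GPU so the pages are resident before the timed loop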
+ cudaCheckError(cudaMemPrefetchAsync(A_vals_, nnz_ * sizeof(T), gpuDevice_, stream1_));
+ cudaCheckError(cudaMemPrefetchAsync(A_cols_, nnz_ * sizeof(int64_t), gpuDevice_, stream2_));
+ cudaCheckError(cudaMemPrefetchAsync(A_rows_, (m_ + 1) * sizeof(int64_t), gpuDevice_, stream3_));
+ cudaCheckError(cudaMemPrefetchAsync(B_, (n_ * k_) * sizeof(T), gpuDevice_, stream4_));
+ cudaCheckError(cudaMemPrefetchAsync(C_, (m_ * n_) * sizeof(T), gpuDevice_, stream5_));
+ cudaCheckError(cudaDeviceSynchronize());
+ break;
+ }
+ }
+ }
+
+ void callSpmdnm() override {
+ switch(offload_) {
+ case gpuOffloadType::always: {
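+ // Re-transfer all input data on every call so each iteration includes the host-to-device copy cost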
+ // Move over data
+ cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_));
+ cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_));
+ cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_));
+ cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_));
+ cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_));
+
+ // Set up descriptors
+ cusparseCheckError(cusparseCreateCsr(&A_descr_,
+ m_,
+ k_,
+ nnz_,
+ A_rows_dev_,
+ A_cols_dev_,
+ A_vals_dev_,
+ index_,
+ index_,
+ base_,
+ dataType_));
+ cusparseCheckError(cusparseCreateDnMat(&B_descr_,
+ k_,
+ n_,
+ n_,
+ B_dev_,
+ dataType_,
+ B_order_));
+ cusparseCheckError(cusparseCreateDnMat(&C_descr_,
+ m_,
+ n_,
+ n_,
+ C_dev_,
+ dataType_,
+ C_order_));
+
+ // Set up temporary buffers
+ void* dBuffer = nullptr;
+ size_t bufferSize = 0;
+
+ // Query the external workspace size required by cusparseSpMM
+ cusparseCheckError(cusparseSpMM_bufferSize(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ &bufferSize));
+
+ // Allocate the temporary buffer
+ cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize));
+
+ cusparseCheckError(cusparseSpMM_preprocess(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+
+ cusparseCheckError(cusparseSpMM(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+ cudaCheckError(cudaDeviceSynchronize());
+
+ // Clean up descriptors
+ cusparseCheckError(cusparseDestroySpMat(A_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(B_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(C_descr_));
+
+ // Free up the temporary buffer
+ cudaCheckError(cudaFree(dBuffer));
+
+ // Move result back to CPU
+ cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_),
+ cudaMemcpyDeviceToHost, stream1_));
+ cudaCheckError(cudaDeviceSynchronize());
+ break;
+ }
+ case gpuOffloadType::once: {
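+ // Inputs already reside on the device; only the descriptors, workspace and SpMM call are repeated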
+ // Set up descriptors
+ cusparseCheckError(cusparseCreateCsr(&A_descr_,
+ m_,
+ k_,
+ nnz_,
+ A_rows_dev_,
+ A_cols_dev_,
+ A_vals_dev_,
+ index_,
+ index_,
+ base_,
+ dataType_));
+ cusparseCheckError(cusparseCreateDnMat(&B_descr_,
+ k_,
+ n_,
+ n_,
+ B_dev_,
+ dataType_,
+ B_order_));
+ cusparseCheckError(cusparseCreateDnMat(&C_descr_,
+ m_,
+ n_,
+ n_,
+ C_dev_,
+ dataType_,
+ C_order_));
+
+ size_t bufferSize = 0;
+ // Query the external workspace size required by cusparseSpMM
+ cusparseCheckError(cusparseSpMM_bufferSize(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ &bufferSize));
+
+ // Allocate the temporary buffer
+ void* dBuffer = nullptr;
+ cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize));
+ cusparseCheckError(cusparseSpMM_preprocess(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+
+ cusparseCheckError(cusparseSpMM(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+
+ // Clean up descriptors
+ cusparseCheckError(cusparseDestroySpMat(A_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(B_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(C_descr_));
+
+ // Free up the temporary buffer
+ cudaCheckError(cudaFree(dBuffer));
+ break;
+ }
+ case gpuOffloadType::unified: {
+ // Create descriptors for the matrices
+ cusparseCheckError(cusparseCreateCsr(&A_descr_,
+ m_,
+ k_,
+ nnz_,
+ A_rows_,
+ A_cols_,
+ A_vals_,
+ index_,
+ index_,
+ base_,
+ dataType_));
+ cusparseCheckError(cusparseCreateDnMat(&B_descr_,
+ k_,
+ n_,
+ n_,
+ B_,
+ dataType_,
+ B_order_));
+ cusparseCheckError(cusparseCreateDnMat(&C_descr_,
+ m_,
+ n_,
+ n_,
+ C_,
+ dataType_,
+ C_order_));
+
+ // Set up temporary buffers
+ void* dBuffer = nullptr;
+ size_t bufferSize = 0;
+
+ // Query the external workspace size required by cusparseSpMM
+ cusparseCheckError(cusparseSpMM_bufferSize(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ &bufferSize));
+
+ // Allocate the temporary buffer
+ cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize));
+
+ cusparseCheckError(cusparseSpMM_preprocess(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+ cudaCheckError(cudaDeviceSynchronize());
+
+ cusparseCheckError(cusparseSpMM(handle_,
+ opA_,
+ opB_,
+ &alpha,
+ A_descr_,
+ B_descr_,
+ &beta,
+ C_descr_,
+ dataType_,
+ alg_,
+ dBuffer));
+ cudaCheckError(cudaDeviceSynchronize());
+
+ // Clean up descriptors
+ cusparseCheckError(cusparseDestroySpMat(A_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(B_descr_));
+ cusparseCheckError(cusparseDestroyDnMat(C_descr_));
+ cudaCheckError(cudaDeviceSynchronize());
+
+ // Free up the temporary buffer
+ cudaCheckError(cudaFree(dBuffer));
+ break;
+ }
+ }
+ }
+
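+ /** Copy (or prefetch) the result matrix C back to the host once the benchmark loop has finished. */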
+ void postLoopRequirements() override {
+ switch (offload_) {
+ case gpuOffloadType::always: {
+ break;
+ }
+ case gpuOffloadType::once: {
+ // Move result back to CPU
+ cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_),
+ cudaMemcpyDeviceToHost, stream1_));
+ cudaCheckError(cudaDeviceSynchronize());
+ break;
+ }
+ case gpuOffloadType::unified: {
+ // Move result back to CPU
+ cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_,
+ cudaCpuDeviceId, stream1_));
+ cudaCheckError(cudaDeviceSynchronize());
+ break;
+ }
+ }
+ }
+
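+ /** Release the CSR, B and C buffers allocated for this benchmark run. */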
+ void postCallKernelCleanup() override {
+ if (offload_ == gpuOffloadType::unified) {
+ cudaCheckError(cudaFree(A_vals_));
+ cudaCheckError(cudaFree(A_cols_));
+ cudaCheckError(cudaFree(A_rows_));
+ cudaCheckError(cudaFree(B_));
+ cudaCheckError(cudaFree(C_));
+ free(A_vals_store_);
+ free(A_cols_store_);
+ free(A_rows_store_);
+ } else {
+ free(A_vals_);
+ free(A_cols_);
+ free(A_rows_);
+ free(B_);
+ free(C_);
+ cudaCheckError(cudaFree(A_vals_dev_));
+ cudaCheckError(cudaFree(A_cols_dev_));
+ cudaCheckError(cudaFree(A_rows_dev_));
+ cudaCheckError(cudaFree(B_dev_));
+ cudaCheckError(cudaFree(C_dev_));
+ }
+ }
+
+ bool alreadyInitialised_ = false;
+
+ /** Handle used when calling cuSPARSE. */
+ cusparseHandle_t handle_;
+
+ /** CUDA Streams - used to asynchronously move data between host and device. */
+ cudaStream_t stream1_;
+ cudaStream_t stream2_;
+ cudaStream_t stream3_;
+ cudaStream_t stream4_;
+ cudaStream_t stream5_;
+
+ /** The ID of the target GPU Device. */
+ int gpuDevice_;
+
+ /** The constant value Alpha. */
+ const T alpha = ALPHA;
+
+ /** The constant value Beta. */
+ const T beta = BETA;
+
+ // cuSPARSE parameters
+ cusparseOperation_t opA_;
+ cusparseOperation_t opB_;
+ cusparseSpMMAlg_t alg_;
+ cusparseIndexType_t index_;
+ cusparseIndexBase_t base_;
+ cudaDataType_t dataType_;
+
+ /**
+ * ___________ Host data ______________
+ */
+ /** CSR format vectors for matrix A */
+ cusparseSpMatDescr_t A_descr_;
+ T* A_vals_;
+ int64_t* A_cols_;
+ int64_t* A_rows_;
+ int64_t A_num_rows_;
+ int64_t A_num_cols_;
+
+ /** dense format values for matrices B and C */
+ cusparseDnMatDescr_t B_descr_;
+ int64_t B_num_rows_;
+ int64_t B_num_cols_;
+ int64_t B_leading_dim_;
+ cusparseOrder_t B_order_;
+
+ cusparseDnMatDescr_t C_descr_;
+ int64_t C_num_rows_;
+ int64_t C_num_cols_;
+ int64_t C_leading_dim_;
+ cusparseOrder_t C_order_;
+
+ /**
+ * _____________ Device data ________________
+ */
+ T* A_vals_dev_;
+ int64_t* A_cols_dev_;
+ int64_t* A_rows_dev_;
+
+ T* B_dev_;
+
+ T* C_dev_;
+
+ T* A_vals_store_;
+ int64_t* A_cols_store_;
+ int64_t* A_rows_store_;
+};
+};
+
+
+#endif
diff --git a/cuBLAS/spmdnv.hh b/cuBLAS/spmdnv.hh
new file mode 100644
index 0000000..4d0317e
--- /dev/null
+++ b/cuBLAS/spmdnv.hh
@@ -0,0 +1,484 @@
+#pragma once
+
+#ifdef GPU_CUBLAS
+#include <cuda_runtime.h>
+#include <cusparse.h>
+#include <type_traits>
+#include <cstring>
+#include <iostream>
+
+#include "../include/kernels/GPU/spmdnv.hh"
+#include "../include/utilities.hh"
+#include "common.hh"
+
+namespace gpu {
+/** A class for SpMDnV GPU BLAS kernels. */
+template <typename T>
+class spmdnv_gpu : public spmdnv<T> {
+ public:
+ using spmdnv<T>::spmdnv;
+ using spmdnv<T>::initInputMatrixVector;
+ using spmdnv<T>::nnz_;
+ using spmdnv<T>::m_;
+ using spmdnv<T>::n_;
+ using spmdnv<T>::x_;
+ using spmdnv<T>::y_;
+ using spmdnv<T>::offload_;
+ using spmdnv