diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..d5e0157
Binary files /dev/null and b/.DS_Store differ
diff --git a/.clang-format b/.clang-format
index 2aec894..276a9db 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,4 +2,5 @@
 BasedOnStyle: Google
 DerivePointerAlignment: false
 PointerAlignment: true
-Standard: C++11
\ No newline at end of file
+AlignAfterOpenBracket: BlockIndent
+Standard: C++20
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index da20b26..c2b88e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,5 +56,15 @@
 gpu-blob
 CSV*
 Graphs*
-# VS Code
-.vscode
\ No newline at end of file
+# IDE
+.vscode
+
+# MAC metadata
+.DS_Store
+
+# CSV files and graphs
+*.csv
+*.png
+
+# Bash scripts to run on different systems
+*.sh
diff --git a/.idea/GPU-BLAS-Offload-Benchmark.iml b/.idea/GPU-BLAS-Offload-Benchmark.iml
new file mode 100644
index 0000000..190534e
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..a55e7a1
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..830d3c8
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..eff3984
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..461bf83
diff --git a/AOCL/gemm.hh b/AOCL/gemm.hh
index 3c6b5c0..f418bdc 100644
--- a/AOCL/gemm.hh
+++ b/AOCL/gemm.hh
@@ -23,6 +23,7 @@ class gemm_cpu :
public gemm { private: /** Make call to the GEMM kernel. */ void callGemm() override { + if constexpr (std::is_same_v) { bli_sgemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, m_, n_, k_, &alpha, A_, rowStride, std::max(1, m_), B_, rowStride, std::max(1, k_), diff --git a/AOCL/spmdnm.hh b/AOCL/spmdnm.hh new file mode 100644 index 0000000..f47007c --- /dev/null +++ b/AOCL/spmdnm.hh @@ -0,0 +1,336 @@ +#pragma once + +#ifdef CPU_AOCL +#include "aoclsparse.h" + +#include +#include + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + base_ = aoclsparse_index_base_zero; + order_ = aoclsparse_order_row; + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_set_mat_index_base(A_description_, base_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_set_mat_index_base is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + k_aocl_ = k_ = k; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + nnz_aocl_ = nnz_; + + B_ = (T*)calloc(k_ * n_, sizeof(T)); + C_ = (T*)calloc(m_ * n_, sizeof(T)); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + + // Initialise datastructures for the CSR format + A_rows_ = new aoclsparse_int[m_ + 1]; + A_cols_ = new aoclsparse_int[nnz_aocl_]; + A_vals_ = new T[nnz_aocl_]; + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << k_ << " . 
" << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + } + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override { + operation_ = aoclsparse_operation_none; // Just saying no transposition happening first + if constexpr (std::is_same_v) { + status_ = aoclsparse_scsrmm(operation_, + alpha, + A_aocl_, + A_description_, + order_, + B_, + n_aocl_, + n_aocl_, + beta, + C_, + n_aocl_); + } else if constexpr(std::is_same_v) { + status_ = aoclsparse_dcsrmm(operation_, + alpha, + A_aocl_, + A_description_, + order_, + B_, + n_aocl_, + n_aocl_, + beta, + C_, + n_aocl_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_?csrmm is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + std::cerr << "\tm_aocl_=" << m_aocl_ << std::endl; + std::cerr << "\tn_aocl_=" << n_aocl_ << std::endl; + std::cerr << "\tk_aocl_=" << k_aocl_ << std::endl; + std::cerr << "\tnnz_aocl_=" << nnz_aocl_ << std::endl; + printAOCLError(status_); + } + } + + void postLoopRequirements() override { + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + delete[] A_vals_; + delete[] A_cols_; + delete[] A_rows_; + delete[] B_; + delete[] C_; + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; + case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) 
contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + void internalCheck(aoclsparse_int maj_dim, + aoclsparse_int min_dim, + aoclsparse_int nnz, + const aoclsparse_int *idx_ptr, + const aoclsparse_int *indices, + const void *val, + int shape, + int base) { + if (idx_ptr == nullptr) { + std::cerr << "INVALID ROWS ARRAY" << std::endl; + exit(1); + } + if (indices == nullptr){ + std::cerr << "INVALID COLS ARRAY" << std::endl; + exit(1); + } + if (val == nullptr){ + std::cerr << "INVALID VALS ARRAY" << std::endl; + exit(1); + } + + if ((min_dim < 0) || (maj_dim < 0) || (nnz < 0)) { + std::cerr << "Wrong min_dim/maj_dim/nnz" << std::endl; + exit(1); + } + + if ((idx_ptr[0] - base) != 0) { + std::cerr << "Wrong csr_row_ptr[0] or csc.col_ptr[0]" << std::endl; + exit(1); + } + + if ((idx_ptr[maj_dim] - base) != nnz) { + std::cerr << "Wrong csr_row_ptr[m]!=nnz or csc.col_ptr[n]!=nnz" << std::endl; + exit(1); + } + for (aoclsparse_int i = 1; i <= maj_dim; i++) { + if (idx_ptr[i - 1] > idx_ptr[i]) { + std::cerr << "Wrong csr_row_ptr/csc.col_ptr - not nondecreasing" << std::endl; + exit (1); + } + } + + // assume indices are fully sorted & fulldiag matrix unless proved otherwise + int sort = 1; + bool fulldiag = true; + + aoclsparse_int idxstart, idxend, j, jmin = 0, jmax = min_dim - 1; + for (aoclsparse_int i = 0; i < maj_dim; i++) { + idxend = idx_ptr[i + 1] - base; + idxstart = idx_ptr[i] - base; + if (shape == 1) { + jmin = 0; + jmax = i; + } else if (shape == 2) { + jmin = i; + jmax = min_dim - 1; + } + // check if visited D, U group within this row + bool diagonal = false, upper = false; + aoclsparse_int prev = -1; // holds previous col index, initially set to -1 + + for (aoclsparse_int idx = idxstart; idx < idxend; idx++) { + j = indices[idx] - base; + if (j < jmin || j > jmax) { + std::cerr << "Wrong index - out of bounds or 
triangle, @idx=" << idx << ": j=" << j << ", i=" << i << std::endl; + exit(1); + } + // check for sorting pattern for each element in a row + if (sort != 3) { + if (prev > j) sort = 2; // unsorted col idx (duplicate elements are allowed) + else prev = j; // update previous col index + + // check for group-order + if ((j <= i && upper) || (j < i && diagonal)) sort = 3; + } + if (j > i) upper = true; + else if(j == i) { + if (diagonal) { + std::cerr << "Wrong diag - duplicate diag for i=j=" << i << std::endl; + exit(1); + } + // diagonal element visited + diagonal = true; + } + } + if (!diagonal && i < min_dim) fulldiag = false; // missing diagonal + } + } + + aoclsparse_status status_; + aoclsparse_order order_; + + aoclsparse_operation operation_; + aoclsparse_index_base base_; + + aoclsparse_mat_descr A_description_; + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_; + aoclsparse_int* A_cols_; + T* A_vals_; + + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int k_aocl_; + aoclsparse_int nnz_aocl_; + + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/AOCL/spmdnv.hh b/AOCL/spmdnv.hh new file mode 100644 index 0000000..d529713 --- /dev/null +++ b/AOCL/spmdnv.hh @@ -0,0 +1,273 @@ +#pragma once + +#ifdef CPU_AOCL + +#include "aoclsparse.h" +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + using spmdnv::iterations_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + if (print_) std::cout << "=========== Matrix = " << m << "x" << n << " ===========" << std::endl; + base_ = aoclsparse_index_base_zero; + operation_ = aoclsparse_operation_none; + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + nnz_aocl_ = nnz_; + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + + if (print_) std::cout << "About to initialise matrices" << std::endl; + initInputMatrixVector(); + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_mat_descr failing for A" << std::endl; + printAOCLError(status_); + } + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnz_aocl_, sizeof(T)); + A_cols_ = (aoclsparse_int*)calloc(nnz_aocl_, sizeof(aoclsparse_int)); + A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + n_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + n_aocl_, + nnz_aocl_, + A_rows_, + A_cols_, + 
A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr is failing with problem size of " << m_ << "x" << n_ << " . " << n_ << "x" << n_ << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_create_?csr success" << std::endl; + } + } + +private: + void preLoopRequirements() override { + status_ = aoclsparse_set_mv_hint(A_aocl_, + operation_, + A_description_, + 5); // Currently hard coded iternation count + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_set_mv_hint failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_set_mv_hint success" << std::endl; + } + + status_ = aoclsparse_optimize(A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_optimize failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_optimize success" << std::endl; + } + } + + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = aoclsparse_smv(operation_, + &alpha, + A_aocl_, + A_description_, + x_, + &beta, + y_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_dmv(operation_, + &alpha, + A_aocl_, + A_description_, + x_, + &beta, + y_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_?mv failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_?mv success" << std::endl; + } + } + + void postLoopRequirements() override { + if (debug) { + std::cout << "========== CPU ==========" << std::endl; + std::cout << "___________________________________________" << std::endl; + std::cout << "x =" << std::endl; + std::cout << "["; + for (int64_t i = 0; i < n_; i++) { + std::cout << x_[i]; + if (i < (n_ - 1)) std::cout << ", "; + } + std::cout << "]" << std::endl; + + std::cout << "y =" << std::endl; + std::cout << "["; + for (int64_t i = 0; i < m_; i++) { + std::cout << y_[i]; + if (i < (m_ - 1)) std::cout << ", "; + } + std::cout << "]" << std::endl; + std::cout << "___________________________________________" << std::endl; + } + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_destroy_mat_descr success" << std::endl; + } + + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing" << std::endl; + printAOCLError(status_); + } else if (print_) { + std::cout << "aoclsparse_destroy success" << std::endl; + } + + delete[] A_vals_; + delete[] A_cols_; + delete[] A_rows_; + delete[] x_; + delete[] y_; + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; + case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) 
contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + bool print_ = false; + bool debug = false; + + aoclsparse_status status_; + + aoclsparse_operation operation_; + aoclsparse_index_base base_; + + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_; + aoclsparse_int* A_cols_; + T* A_vals_; + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int nnz_aocl_; + + aoclsparse_mat_descr A_description_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/AOCL/spmspm.hh b/AOCL/spmspm.hh new file mode 100644 index 0000000..38c5c3f --- /dev/null +++ b/AOCL/spmspm.hh @@ -0,0 +1,375 @@ +#pragma once + +#ifdef CPU_AOCL +#include "aoclsparse.h" + +#include +#include +#include + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::callConsume; + using spmspm::initInputMatrices; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::iterations_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + using spmspm::C_nnz_; + + void initialise(int m, int n, int k, double sparsity, matrixType type, + bool binary = false) { + sparsity_ = sparsity; + type_ = type; + + m_aocl_ = m_ = m; + n_aocl_ = n_ = n; + k_aocl_ = k_ = k; + + + uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_; + uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_; + nnzA_aocl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity)); + nnzB_aocl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity)); + C_allocated = false; + + base_ = aoclsparse_index_base_zero; + operationA_ = aoclsparse_operation_none; + operationB_ 
= aoclsparse_operation_none; + + status_ = aoclsparse_create_mat_descr(&A_description_); + if (status_ != aoclsparse_status_success) { + printAOCLError(status_); + } + status_ = aoclsparse_create_mat_descr(&B_description_); + if (status_ != aoclsparse_status_success) { + printAOCLError(status_); + } + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_rows_ = (aoclsparse_int*)calloc(m_ + 1, sizeof(aoclsparse_int)); + A_cols_ = (aoclsparse_int*)calloc(nnzA_aocl_, sizeof(aoclsparse_int)); + A_vals_ = (T*)calloc(nnzA_aocl_, sizeof(T)); + if (A_rows_ == nullptr || A_cols_ == nullptr || A_vals_ == nullptr) { + std::cerr << "Failed to allocate memory for A CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + exit(1); + } + + // Initialise datastructures for the CSR format + B_rows_ = (aoclsparse_int*)calloc(k_ + 1, sizeof(aoclsparse_int)); + B_cols_ = (aoclsparse_int*)calloc(nnzB_aocl_, sizeof(aoclsparse_int)); + B_vals_ = (T*)calloc(nnzB_aocl_, sizeof(T)); + if (B_rows_ == nullptr || B_cols_ == nullptr || B_vals_ == nullptr) { + std::cerr << "Failed to allocate memory for B CSR arrays with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + exit(1); + } + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0); + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnzA_aocl_, + A_rows_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&A_aocl_, + base_, + m_aocl_, + k_aocl_, + nnzA_aocl_, + A_rows_, + A_cols_, + A_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Now sort the matrix -- needed for this AOCL function + status_ = aoclsparse_order_mat(A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_order_mat for A is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Move into the AOCL CSR matrix handle + if constexpr (std::is_same_v) { + status_ = aoclsparse_create_scsr(&B_aocl_, + base_, + k_aocl_, + n_aocl_, + nnzB_aocl_, + B_rows_, + B_cols_, + B_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_create_dcsr(&B_aocl_, + base_, + k_aocl_, + n_aocl_, + nnzB_aocl_, + B_rows_, + B_cols_, + B_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_create_?csr for B is failing with problem size of " << m_ << "x" << k_ << " . 
" << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + + // Now sort the matrix -- needed for this AOCL function + status_ = aoclsparse_order_mat(B_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_order_mat for B is failing with problem size of " << m_ << "x" << k_ << " . " << k_ << "x" << n_ << std::endl; + printAOCLError(status_); + } + } + +private: + void preLoopRequirements() override { + } + + void callSpmspm() override { + if (C_allocated) { + if (C_vals_ != nullptr) { + free(C_vals_); + C_vals_ = nullptr; + } + if (C_cols_aocl_ != nullptr) { + free(C_cols_aocl_); + C_cols_aocl_ = nullptr; + } + if (C_rows_aocl_ != nullptr) { + free(C_rows_aocl_); + C_rows_aocl_ = nullptr; + } + C_allocated = false; + } + + request_ = aoclsparse_stage_nnz_count; + status_ = aoclsparse_sp2m(operationA_, + A_description_, + A_aocl_, + operationB_, + B_description_, + B_aocl_, + request_, + &C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_nnz_count" << std::endl; + printAOCLError(status_); + } + + request_ = aoclsparse_stage_finalize; + status_ = aoclsparse_sp2m(operationA_, + A_description_, + A_aocl_, + operationB_, + B_description_, + B_aocl_, + request_, + &C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_sp2m failing with request = aoclsparse_stage_finalize" << std::endl; + printAOCLError(status_); + } + + if constexpr (std::is_same_v) { + status_ = aoclsparse_export_scsr(C_aocl_, + &base_, + &C_M, + &C_N, + &nnzC_aocl_, + &C_rows_aocl_, + &C_cols_aocl_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = aoclsparse_export_dcsr(C_aocl_, + &base_, + &C_M, + &C_N, + &nnzC_aocl_, + &C_rows_aocl_, + &C_cols_aocl_, + &C_vals_); + } + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_export_zcsr failing" << std::endl; + printAOCLError(status_); + } + C_allocated = true; + } + + void postLoopRequirements() override { + C_nnz_ = nnzC_aocl_; // Needed for checksum + } + + void postCallKernelCleanup() override { + status_ = aoclsparse_destroy_mat_descr(A_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing for A_description_" << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy_mat_descr(B_description_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy_mat_descr failing for B_description_" << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_destroy(&A_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for A" << std::endl; + printAOCLError(status_); + } + status_ = aoclsparse_destroy(&B_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for B" << std::endl; + printAOCLError(status_); + } + + status_ = aoclsparse_destroy(&C_aocl_); + if (status_ != aoclsparse_status_success) { + std::cerr << "aoclsparse_destroy failing for C" << std::endl; + printAOCLError(status_); + } + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(B_rows_); + free(B_cols_); + free(B_vals_); + } + + void printAOCLError(aoclsparse_status stat) { + switch (stat) { + case aoclsparse_status_success: + std::cerr << "SUCCESS - The operation completed successfully"; + break; + case aoclsparse_status_not_implemented: + std::cerr << "NOT_IMPLEMENTED - The requested functionality is not yet implemented in this version"; + break; 
+ case aoclsparse_status_invalid_pointer: + std::cerr << "INVALID_POINTER - One or more pointer parameters are NULL or otherwise invalid"; + break; + case aoclsparse_status_invalid_size: + std::cerr << "INVALID_SIZE - One or more size parameters (m, n, nnz, etc.) contain an invalid value (e.g., negative or zero where positive required)"; + break; + case aoclsparse_status_internal_error: + std::cerr << "INTERNAL_ERROR - Internal library failure"; + break; + case aoclsparse_status_invalid_value: + std::cerr << "INVALID_VALUE - Input parameters contain an invalid value (e.g., invalid enum value, base index neither 0 nor 1)"; + break; + case aoclsparse_status_invalid_index_value: + std::cerr << "INVALID_INDEX_VALUE - At least one index value is invalid (e.g., negative or out of bounds)"; + break; + case aoclsparse_status_maxit: + std::cerr << "MAXIT - function stopped after reaching number of iteration limit"; + break; + case aoclsparse_status_user_stop: + std::cerr << "USER_STOP - user requested termination"; + break; + case aoclsparse_status_wrong_type: + std::cerr << "WRONG_TYPE - Data type mismatch (e.g., matrix datatypes don't match between operations)"; + break; + case aoclsparse_status_memory_error: + std::cerr << "MEMORY_ERROR - memory allocation failure"; + break; + case aoclsparse_status_numerical_error: + std::cerr << "NUMERICAL_ERROR - numerical error, e.g., matrix is not positive definite, devide-by-zero error"; + break; + case aoclsparse_status_invalid_operation: + std::cerr << "INVALID_OPERATION - cannot proceed with the request at this point"; + break; + case aoclsparse_status_unsorted_input: + std::cerr << "UNSORTED_INPUT - the input matrices are not sorted"; + break; + case aoclsparse_status::aoclsparse_status_invalid_kid: + std::cerr << "INVALID_KID - user requested kernel id was not available"; + break; + default: + std::cerr << "UNKNOWN_STATUS - Unrecognized status code (" + std::to_string(stat) + ")"; + break; + } + std::cerr << std::endl; + exit(1); + } + + aoclsparse_status status_; + + aoclsparse_operation operationA_; + aoclsparse_operation operationB_; + aoclsparse_index_base base_; + aoclsparse_request request_; + + aoclsparse_matrix A_aocl_; + aoclsparse_int* A_rows_ = nullptr; + aoclsparse_int* A_cols_ = nullptr; + T* A_vals_ = nullptr; + + aoclsparse_matrix B_aocl_; + aoclsparse_int* B_rows_ = nullptr; + aoclsparse_int* B_cols_ = nullptr; + T* B_vals_ = nullptr; + + aoclsparse_matrix C_aocl_; + aoclsparse_int* C_rows_aocl_ = nullptr; + aoclsparse_int* C_cols_aocl_ = nullptr; + bool C_allocated = false; + + aoclsparse_int m_aocl_; + aoclsparse_int n_aocl_; + aoclsparse_int k_aocl_; + aoclsparse_int nnzA_aocl_; + aoclsparse_int nnzB_aocl_; + aoclsparse_int nnzC_aocl_; + + aoclsparse_int C_M, C_N; + + aoclsparse_mat_descr A_description_; + aoclsparse_mat_descr B_description_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/ArmPL/gemm.hh b/ArmPL/gemm.hh index af7f428..10903d8 100644 --- a/ArmPL/gemm.hh +++ b/ArmPL/gemm.hh @@ -1,7 +1,7 @@ #pragma once #ifdef CPU_ARMPL -#include +#include "armpl.h" #include #include @@ -36,8 +36,7 @@ class gemm_cpu : public gemm { std::max(1, m_)); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for ArmPL CPU GEMM kernel not supported." 
<< std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/ArmPL/gemv.hh b/ArmPL/gemv.hh index cc0e9bf..c568c99 100644 --- a/ArmPL/gemv.hh +++ b/ArmPL/gemv.hh @@ -1,7 +1,7 @@ #pragma once #ifdef CPU_ARMPL -#include +#include "armpl.h" #include #include @@ -34,8 +34,7 @@ class gemv_cpu : public gemv { std::max(1, m_), x_, vecIncrement_, beta, y_, vecIncrement_); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." << std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/ArmPL/spmdnm.hh b/ArmPL/spmdnm.hh new file mode 100644 index 0000000..4f53c10 --- /dev/null +++ b/ArmPL/spmdnm.hh @@ -0,0 +1,43 @@ +#pragma once + +#ifdef CPU_ARMPL + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + + +#endif diff --git a/ArmPL/spmdnv.hh b/ArmPL/spmdnv.hh new file mode 100644 index 0000000..7b0cf93 --- /dev/null +++ b/ArmPL/spmdnv.hh @@ -0,0 +1,205 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include +#include "armpl.h" +#include + +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for GEMM CPU BLAS kernels. */ +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + using spmdnv::iterations_; + + /** Initialise the required data structures. 
*/ + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_armpl_ = m_ = m; + n_armpl_ = n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_armpl_ = nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + + // Initialise the matrix and vectors + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + // Make arrays for A + A_vals_ = (T*)calloc(nnz_armpl_, sizeof(T)); + A_cols_ = (armpl_int_t*)calloc(nnz_armpl_, sizeof(armpl_int_t)); + A_rows_ = (armpl_int_t*)calloc(m_ + 1, sizeof(armpl_int_t)); + + // Fill the CSR arrays + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, n_armpl_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Create the armpl object for this sparse matrix + if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + n_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + n_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cerr << "ERROR - Datatype for ArmPL CPU SpMDnV kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + +private:/** Perform any required steps before calling the SpMDnV kernel that should + * be timed. */ + void preLoopRequirements() override { + // Give the library some hints so it can optimise the performance of the kernel + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMV_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMV_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + // Now optimise the matrix for SpMV based on the hints given + status_ = armpl_spmv_optimize(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + /** Make call to the SpMDnV kernel. */ + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = armpl_spmv_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + x_, + beta, + y_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmv_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + x_, + beta, + y_); + } else { + // Un-specialised class will not do any work - print error and exit. 
+ std::cerr << "ERROR - Datatype for ArmPL CPU GEMV kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + + // Ensure compiler doesn't optimise away the work being done + callConsume(); + } + + + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + void postLoopRequirements() override {} + + void postCallKernelCleanup() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(x_); + free(y_); + } + + armpl_status_t status_; + + armpl_int_t n_armpl_; + armpl_int_t m_armpl_; + armpl_int_t nnz_armpl_; + + T* A_vals_; + armpl_int_t* A_rows_; + armpl_int_t* A_cols_; + + armpl_spmat_t A_armpl_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/ArmPL/spmspm.hh b/ArmPL/spmspm.hh new file mode 100644 index 0000000..bb17392 --- /dev/null +++ b/ArmPL/spmspm.hh @@ -0,0 +1,364 @@ +#pragma once + +#ifdef CPU_ARMPL +#include +#include "armpl.h" +#include + +#include +#include +#include +#include + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */ +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::callConsume; + using spmspm::initInputMatrices; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::iterations_; + using spmspm::C_vals_; + using spmspm::C_nnz_; + + void initialise(int m, int n, int k, double sparsity, matrixType type, + bool binary = false) { + sparsity_ = sparsity; + type_ = type; + + m_armpl_ = m_ = m; + n_armpl_ = n_ = n; + k_armpl_ = k_ = k; + + + uint64_t total_elements_A = (uint64_t)m_ * (uint64_t)k_; + uint64_t total_elements_B = (uint64_t)k_ * (uint64_t)n_; + nnzA_armpl_ = A_nnz_ = 1 + (uint64_t)((double)total_elements_A * (1.0 - sparsity)); + nnzB_armpl_ = B_nnz_ = 1 + (uint64_t)((double)total_elements_B * (1.0 - sparsity)); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnzA_armpl_, sizeof(T)); + A_cols_ = (armpl_int_t*)calloc(nnzA_armpl_, sizeof(armpl_int_t)); + A_rows_ = (armpl_int_t*)calloc(m_armpl_ + 1, sizeof(armpl_int_t)); + + B_vals_ = (T*)calloc(nnzB_armpl_, sizeof(T)); + B_cols_ = (armpl_int_t*)calloc(nnzB_armpl_, sizeof(armpl_int_t)); + B_rows_ = (armpl_int_t*)calloc(k_armpl_ + 1, sizeof(armpl_int_t)); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_armpl_, k_armpl_, nnzA_armpl_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows_, k_armpl_, n_armpl_, nnzB_armpl_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not 
supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_, A_cols_, k_, B_nnz_, B_rows_, B_cols_) == 0); + + // Now make the sparse matrix objects + if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_s(&A_armpl_, + m_armpl_, + k_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_create_csr_s(&B_armpl_, + k_armpl_, + n_armpl_, + B_rows_, + B_cols_, + B_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_create_csr_d(&A_armpl_, + m_armpl_, + k_armpl_, + A_rows_, + A_cols_, + A_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_create_csr_d(&B_armpl_, + k_armpl_, + n_armpl_, + B_rows_, + B_cols_, + B_vals_, + 0); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + C_armpl_ = armpl_spmat_create_null(m_armpl_, n_armpl_); + } + +private: + void preLoopRequirements() override { + // Populate A and B with hints + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(A_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_MEMORY, + ARMPL_SPARSE_MEMORY_NOALLOCS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_STRUCTURE, + ARMPL_SPARSE_STRUCTURE_UNSTRUCTURED); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_INVOCATIONS, + ARMPL_SPARSE_INVOCATIONS_FEW); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_OPERATION, + ARMPL_SPARSE_OPERATION_NOTRANS); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_hint(B_armpl_, + ARMPL_SPARSE_HINT_SPMM_STRATEGY, + ARMPL_SPARSE_SPMM_STRAT_OPT_FULL_STRUCT); + if (status_ != 
ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + // Call the optimise function to apply hints + status_ = armpl_spmm_optimize(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_SCALAR_ONE, + A_armpl_, + B_armpl_, + ARMPL_SPARSE_SCALAR_ZERO, + C_armpl_); + } + + void callSpmspm() override{ + if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_s(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + B_armpl_, + beta, + C_armpl_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmm_exec_d(ARMPL_SPARSE_OPERATION_NOTRANS, + ARMPL_SPARSE_OPERATION_NOTRANS, + alpha, + A_armpl_, + B_armpl_, + beta, + C_armpl_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + } + + void postLoopRequirements() override { + // Export the C arrays from the structure + if constexpr (std::is_same_v) { + status_ = armpl_spmat_export_csr_s(C_armpl_, + 0, + &m_armpl_, + &n_armpl_, + &C_rows_, + &C_cols_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = armpl_spmat_export_csr_d(C_armpl_, + 0, + &m_armpl_, + &n_armpl_, + &C_rows_, + &C_cols_, + &C_vals_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cout << "ERROR - Datatype for ArmPL CPU SpMSpM kernel not supported." << std::endl; + exit(1); + } + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cerr << "ERROR: " << status_ << std::endl; + exit(1); + } + C_nnz_ = nnzC_armpl_ = C_rows_[m_armpl_]; + + // ARMPL does not seem to enforce ordered column indices in its + // output matrices. Therefore, to allow the checksum to take place + // We have to order the output matrix here. 
+ for (int i = 0; i < m_; i++) { + int start = C_rows_[i]; + int end = C_rows_[i + 1]; + int len = end - start; + if (len > 1) { + std::vector> row_entries(len); + for (int j = 0; j < len; j++) { + row_entries[j] = {C_cols_[start + j], C_vals_[start + j]}; + } + + std::sort(row_entries.begin(), row_entries.end(), + [](const auto &a, const auto &b) { return a.first < b.first; }); + + for (int j = 0; j < len; j++) { + C_cols_[start + j] = row_entries[j].first; + C_vals_[start + j] = row_entries[j].second; + } + } + } + } + + void postCallKernelCleanup() override { + status_ = armpl_spmat_destroy(A_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(B_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = armpl_spmat_destroy(C_armpl_); + if (status_ != ARMPL_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + + free(A_rows_); + free(A_cols_); + free(A_vals_); + free(B_rows_); + free(B_cols_); + free(B_vals_); + free(C_rows_); + free(C_cols_); + free(C_vals_); + } + + const T alpha = ALPHA; + const T beta = BETA; + + + armpl_status_t status_; + + armpl_int_t n_armpl_; + armpl_int_t m_armpl_; + armpl_int_t k_armpl_; + armpl_int_t nnzA_armpl_; + armpl_int_t nnzB_armpl_; + armpl_int_t nnzC_armpl_; + + armpl_int_t* A_cols_; + armpl_int_t* A_rows_; + T* A_vals_; + + armpl_int_t* B_rows_; + armpl_int_t* B_cols_; + T* B_vals_; + + armpl_int_t* C_rows_; + armpl_int_t* C_cols_; + // No C_vals_ needed as inheriting from + // parent in order to allow result check to carry out + + armpl_spmat_t A_armpl_; + armpl_spmat_t B_armpl_; + armpl_spmat_t C_armpl_; + +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/Makefile b/Makefile index 5dd2fc5..d4dbcbe 100644 --- a/Makefile +++ b/Makefile @@ -51,10 +51,10 @@ CXX = $(CXX_$(COMPILER)) CXXFLAGS_ARM = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native CXXFLAGS_CLANG = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_GNU = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -CXXFLAGS_INTEL = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native -Wno-tautological-constant-compare +CXXFLAGS_GNU = -std=c++17 -Wall -Wno-deprecated-declarations -Ofast -$(ARCHFLAG)=native +CXXFLAGS_INTEL = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native -Wno-tautological-constant-compare CXXFLAGS_NVIDIA = -std=c++17 -Wall -O3 -fast -$(ARCHFLAG)=native -CXXFLAGS_HIP = -std=c++17 -Wall -Ofast -$(ARCHFLAG)=native +CXXFLAGS_HIP = -std=c++17 -Wall -O3 -ffast-math -$(ARCHFLAG)=native ifndef CXXFLAGS CXXFLAGS = $(CXXFLAGS_$(COMPILER)) @@ -98,16 +98,16 @@ $(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Libra endif # Add INTEL compiler options ifeq ($(COMPILER), INTEL) -override CXXFLAGS += -L$(MKLROOT)/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int +override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm -ldl -qmkl=parallel -DMKL_INT=int # Add GNU compiler options else ifeq ($(COMPILER), GNU) -override CXXFLAGS += -m64 -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int +override CXXFLAGS += -m64 -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -I"${MKLROOT}/include" -DMKL_INT=int $(warning 
Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) $(info ) # Add CLANG compiler options else ifeq ($(COMPILER), CLANG) -override CXXFLAGS += -L$(MKLROOT)/lib -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int +override CXXFLAGS += -L$(MKLROOT)/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl -m64 -I"${MKLROOT}/include" -DMKL_INT=int $(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) $(info ) @@ -118,10 +118,11 @@ endif HEADER_FILES+= $(wildcard oneMKL/CPU/*.hh) else ifeq ($(CPU_LIB), AOCL) +override CXXFLAGS += -laoclutils -lblis -lflame -laoclsparse ifeq ($(COMPILER), INTEL) -override CXXFLAGS += -lblis-mt -qopenmp +override CXXFLAGS += -qopenmp else -override CXXFLAGS += -lblis-mt -fopenmp +override CXXFLAGS += -fopenmp endif $(warning Users may be required to do the following to use $(COMPILER) with $(CPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS="-L/lib -I/include/blis -Wl,-rpath,/lib"` to make command) @@ -170,14 +171,14 @@ $(warning GPU_LIB not set (use CUBLAS, ONEMKL, ROCBLAS). No GPU kernels will be else ifeq ($(GPU_LIB), CUBLAS) # Do cuBLAS stuff ifeq ($(COMPILER), NVIDIA) -override CXXFLAGS += -cudalib=cublas +override CXXFLAGS += -cudalib=cublas -lcusparse_static else $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/.../math_libs/lib64 -L/.../cuda/lib64` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/.../math_libs/include -I/.../cuda/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/.../math_libs/lib64 -Wl,-rpath,/.../cuda/lib64` to make command) $(info ) -override CXXFLAGS += -lcublas -lcudart +override CXXFLAGS += -lcublas -lcudart -lcusparse endif HEADER_FILES += $(wildcard cuBLAS/*.hh) @@ -188,7 +189,7 @@ ifndef MKLROOT $(error Must add `MKLROOT=/path/to/mkl/` to make command to use OneMKL CPU Library) endif # Add compiler and link options -override CXXFLAGS += -fsycl -L$(MKLROOT)/lib -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_ILP64 -I"$(MKLROOT)/include" +override CXXFLAGS += -fsycl -L$(MKLROOT)/lib/intel64 -lmkl_sycl_blas -lmkl_sycl_sparse -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core -lsycl -lpthread -lm -ldl -fsycl -DMKL_LP64 -I"$(MKLROOT)/include" # `lmkl_tbb_thread` can replace `lmkl_sequential` $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `/lib` to `$$LD_LIBRARY_PATH`) @@ -199,17 +200,17 @@ $(error Selected compiler $(COMPILER) is not currently compatible with oneMKL GP endif else ifeq ($(GPU_LIB), ROCBLAS) -ifeq ($(COMPILER), HIP) +# ifeq ($(COMPILER), HIP) # Do rocBLAS stuff -override CXXFLAGS += -lrocblas -lm -lpthread -D__HIP_PLATFORM_AMD__ +override CXXFLAGS += -lrocblas -lrocsparse -lm -lpthread -D__HIP_PLATFORM_AMD__ $(warning Users may be required to do the following to use $(COMPILER) with $(GPU_LIB):) $(info $(TAB)$(TAB)Add `CXXFLAGS=-L/lib -L/lib` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-I/include -I/include` to make command) $(info $(TAB)$(TAB)Add `CXXFLAGS=-Wl,-rpath,/lib -Wl,-rpath,/lib` to make command) HEADER_FILES += $(wildcard rocBLAS/*.hh) -else -$(error Selected compiler $(COMPILER) is not currently 
compatible with rocBLAS GPU Library) -endif +# else +# $(error Selected compiler $(COMPILER) is not currently compatible with rocBLAS GPU Library) +# endif else @@ -225,7 +226,7 @@ ifdef GPU_LIB override CXXFLAGS += -DGPU_$(GPU_LIB) endif -LDFLAGS = -lm +LDFLAGS = -lm # ------- @@ -233,11 +234,28 @@ EXE = gpu-blob .PHONY: all $(EXE) clean -all: $(EXE) +all: print $(EXE) + +print: + @echo "COMPILER = $(COMPILER)" + @echo "CXX = $(CXX)" + @echo "CPU_LIB = $(CPU_LIB)" + @echo "GPU_LIB = $(GPU_LIB)" + @echo "CXXFLAGS = $(CXXFLAGS)" + @echo "LDFLAGS = $(LDFLAGS)" + @echo "Full command would be:" + @echo "$(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o gpu-blob" + @echo "░░ ░░░ ░░░ ░░░░ ░░░░░░░░ ░░░ ░░░░░░░░░ ░░░ ░░" + @echo "▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒▒▒ ▒" + @echo "▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓ ▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓ ▓▓" + @echo "█ ████ ██ ████████ ████ ████████ ████ ██ ████████ ████ ██ ████ █" + @echo "██ ███ █████████ █████████ ███ ███ ███ ██" + $(EXE): src/Consume/consume.c $(SRC_FILES) $(HEADER_FILES) gcc src/Consume/consume.c -fpic -O0 -shared -o src/Consume/libconsume.so - $(CXX) $(SRC_FILES) $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) -o $@ + @echo "Building main executable with $(CXX)" + $(CXX) $(SRC_FILES) -o $@ $(CXXFLAGS) -Lsrc/Consume -Wl,-rpath,src/Consume -lconsume $(LDFLAGS) clean: - rm -f $(EXE) src/Consume/libconsume.so \ No newline at end of file + rm -f $(EXE) src/Consume/libconsume.so diff --git a/NVPL/spmdnm.hh b/NVPL/spmdnm.hh new file mode 100644 index 0000000..1dc2bc0 --- /dev/null +++ b/NVPL/spmdnm.hh @@ -0,0 +1,43 @@ +#pragma once + +#ifdef CPU_NVPL + +#include "../include/kernels/CPU/spmdnm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + using spmdnm::iterations_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmdnm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + + +#endif diff --git a/NVPL/spmdnv.hh b/NVPL/spmdnv.hh new file mode 100644 index 0000000..e3f4353 --- /dev/null +++ b/NVPL/spmdnv.hh @@ -0,0 +1,213 @@ +#pragma once + +#ifdef CPU_NVPL +#include + +#include "../include/kernels/CPU/spmdnv.hh" +#include "../include/utilities.hh" + +namespace cpu { +/** A class for SpMDnV CPU BLAS kernels. 
*/ +template <typename T> +class spmdnv_cpu : public spmdnv<T> { + public: + using spmdnv<T>::spmdnv; + using spmdnv<T>::callConsume; + using spmdnv<T>::initInputMatrixVector; + using spmdnv<T>::m_; + using spmdnv<T>::n_; + using spmdnv<T>::x_; + using spmdnv<T>::y_; + using spmdnv<T>::sparsity_; + using spmdnv<T>::type_; + using spmdnv<T>::nnz_; + using spmdnv<T>::iterations_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + if constexpr (std::is_same_v<T, float>) { + dataType_ = NVPL_SPARSE_R_32F; + } else if constexpr (std::is_same_v<T, double>) { + dataType_ = NVPL_SPARSE_R_64F; + } else { + throw std::runtime_error("Only float and double are supported for NVPL."); + } + + x_ = (T*)calloc(n_, sizeof(T)); + y_ = (T*)calloc(m_, sizeof(T)); + z_ = (T*)calloc(m_, sizeof(T)); + + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)calloc(nnz_, sizeof(T)); + A_cols_ = (int64_t*)calloc(nnz_, sizeof(int64_t)); + A_rows_ = (int64_t*)calloc(m_ + 1, sizeof(int64_t)); + + // Fill the CSR arrays + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + + // Make the NVPL descriptors + status_ = nvpl_sparse_create_const_csr(&A_descr_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + indexType_, + indexType_, + base_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_const_csr failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_create_const_dn_vec(&X_descr_, + n_, + x_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_const_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_create_dn_vec(&Y_descr_, + m_, + y_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + status_ = nvpl_sparse_create_dn_vec(&Z_descr_, + m_, + z_, + dataType_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_create_dn_vec failed with error: " << status_ << std::endl; + exit(1); + } + } + +private: + void preLoopRequirements() override {} + + void callSpMDnV() override { + size_t bufferSize; + status_ = nvpl_sparse_spmv_buffer_size(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, + description_, + &bufferSize); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv_buffer_size failed with error: " << status_ << std::endl; + exit(1); + } + + void* externalBuffer = malloc(bufferSize); + status_ = nvpl_sparse_spmv_analysis(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, + description_, + externalBuffer); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv_analysis failed with error: " << status_ << std::endl; + exit(1); + } + + status_ = nvpl_sparse_spmv(handle_, + operation_, + &alpha, + A_descr_, + X_descr_, + &beta, + Z_descr_, + Y_descr_, + dataType_, + algorithm_, +
description_); + if (status_ != NVPL_SPARSE_STATUS_SUCCESS) { + std::cerr << "nvpl_sparse_spmv failed with error: " << status_ << std::endl; + exit(1); + } + + free(externalBuffer); + } + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override { + free(x_); + free(y_); + free(z_); + free(A_rows_); + free(A_cols_); + free(A_vals_); + } + + nvpl_sparse_status_t status_; + nvpl_sparse_handle_t handle_; + nvpl_sparse_spmv_descr_t description_; + + nvpl_sparse_spmv_alg_t algorithm_ = NVPL_SPARSE_SPMV_CSR_ALG1; + nvpl_sparse_operation_t operation_ = NVPL_SPARSE_OPERATION_NON_TRANSPOSE; + nvpl_sparse_data_type_t dataType_; + nvpl_sparse_index_type_t indexType_ = NVPL_SPARSE_INDEX_64I; + nvpl_sparse_index_base_t base_ = NVPL_SPARSE_INDEX_BASE_ZERO; + + // Note on naming: for consistency with the other libraries, which do not + // have a separate addition vector, Y is kept as the output vector here, + // even though the NVPL documentation uses Y for the addition vector and + // Z for the output vector. + nvpl_sparse_const_sp_mat_descr_t A_descr_; + nvpl_sparse_const_dn_vec_descr_t X_descr_; + nvpl_sparse_dn_vec_descr_t Z_descr_; + nvpl_sparse_dn_vec_descr_t Y_descr_; + + // CSR arrays for matrix A and the unused addition vector Z + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + T* z_; + + + const T alpha = ALPHA; + const T beta = BETA; +}; +} // namespace cpu +#endif \ No newline at end of file diff --git a/NVPL/spmspm.hh b/NVPL/spmspm.hh new file mode 100644 index 0000000..ac63086 --- /dev/null +++ b/NVPL/spmspm.hh @@ -0,0 +1,30 @@ +#pragma once + +#ifdef CPU_NVPL + +#include "../include/kernels/CPU/spmspm.hh" +#include "../include/utilities.hh" + +namespace cpu { +template <typename T> +class spmspm_cpu : public spmspm<T> { +public: + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) {} + +protected: + void toSparseFormat() override {} + +private: + void preLoopRequirements() override {} + + void callSpmspm() override {} + + void postLoopRequirements() override {} + + void postCallKernelCleanup() override {} +}; +} + +#endif diff --git a/README.md b/README.md index d6f4161..1e6cd5e 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,10 @@ Only when an error occurs will any checksum be displayed to the user. GFLOP/s are calculated using the following Total FLOPs formulas. The compute time excludes any initialisation, but does include any data movement / prefetching to/from the GPU device: - **GEMM** : `FLOPs = (2 * M * N * K) + (b * M * N)` where `b` is `1` if BETA=0 and `3` if BETA=/=0 + - **SPMDNM** : `FLOPs = (2 * N * NNZ)` where NNZ is the number of non-zero values in matrix A + - **SPMSPM** : `FLOPs = (NNZA * NNZB) / K` where NNZA is the number of non-zero values in matrix A and NNZB is the number of non-zero values in matrix B. This is the expected number of FLOPs, assuming the non-zero values are uniformly distributed across the columns of matrix A and the rows of matrix B: each of the NNZA non-zeros in A is then expected to pair with NNZB / K non-zeros in the matching row of B - **GEMV** : `FLOPs = (2 * M * N) + (b * M)` where `b` is `1` if BETA=0 and `3` if BETA=/=0 + - **SPMDNV** : `FLOPs = (2 * NNZ)` where NNZ is the number of non-zero values in matrix A # Build Options Select the compiler you wish to use. Regardless of choice, `gcc` is required in order to build the `Consume.so` external library. 
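For reference, the Total FLOPs formulas added to the README above are easy to sanity-check in a few lines of Python. The sketch below is illustrative only and not part of the repository: the helper name `total_flops` is hypothetical, the non-zero count mirrors the `nnz_ = 1 + ...` expression used by the sparse kernels, and the final line assumes GFLOP/s is reported as total FLOPs across all iterations divided by total seconds.

    # Hypothetical helper (not in the repo): sanity-check the Total FLOPs formulas.
    def total_flops(kernel, M, N, K=1, nnz=0, nnz_a=0, nnz_b=0, beta=0.0):
        b = 1 if beta == 0 else 3
        if kernel == "gemm":      # dense matrix-matrix
            return (2 * M * N * K) + (b * M * N)
        if kernel == "spmdnm":    # sparse x dense matrix, NNZ non-zeros in A
            return 2 * N * nnz
        if kernel == "spmspm":    # sparse x sparse matrix, expected FLOPs
            return (nnz_a * nnz_b) / K
        if kernel == "gemv":      # dense matrix-vector
            return (2 * M * N) + (b * M)
        if kernel == "spmdnv":    # sparse matrix x dense vector
            return 2 * nnz
        raise ValueError("unknown kernel: {}".format(kernel))

    # Example: 4096x4096 SpMDnV at 99.9% sparsity, 100 iterations in 0.05 s.
    nnz = 1 + int(4096 * 4096 * (1.0 - 0.999))          # same expression as the C++ kernels
    gflops = total_flops("spmdnv", M=4096, N=4096, nnz=nnz) * 100 / 0.05 / 1e9
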
@@ -126,18 +129,22 @@ The kernels listed below are computed by the benchmark for a wide range of probl - FP32, FP64 - Square, short-&-wide, tall-&-thin input sizes - + - Square, short-&-wide, tall-&-thin input sizes + + - SpMSpM + - FP32, FP64 + - Square, short-&-wide, tall-&-thin input sizes ### Level 2 BLAS - GEMV - FP32, FP64 - Square, short-&-wide, tall-&-thin input sizes - + - Square, short-&-wide, tall-&-thin input sizes # Auxiliary Files Additional to the main benchmark, there are two auxiliary python scripts which perform the following: @@ -146,7 +153,6 @@ Additional to the main benchmark, there are two auxiliary python scripts which p # Future Work - - [ ] Add support for Sparce Kernels - [ ] Add FP16/BF16 support for kernels - [ ] Add batched GEMM functions - [ ] Add support for Apple Accelerate diff --git a/calculateOffloadThreshold.py b/calculateOffloadThreshold.py index 38c2646..43028c0 100644 --- a/calculateOffloadThreshold.py +++ b/calculateOffloadThreshold.py @@ -165,7 +165,7 @@ def printResults(once:offloadThreshold, always:offloadThreshold, unified:offload gpuAlways.M = 0 gpuAlways.N = 0 gpuAlways.K = 0 - if(gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): + if("gemm" in kernel and gpuUnified.M != 0 and float(cpu[8]) >= float(gpuU[8])): # Do check to see if this is a momentary drop that we should ignore if (prevGpuUgflops <= float(cpu[8])) and (float(gpuLines[2].split(',')[8]) <= float(cpu[8])): gpuUnified.cpuGflops = 0.0 diff --git a/createFlopsPerSizeGraphs.py b/createFlopsPerSizeGraphs.py new file mode 100644 index 0000000..1e50301 --- /dev/null +++ b/createFlopsPerSizeGraphs.py @@ -0,0 +1,988 @@ +import os +import sys +import matplotlib.pyplot as plt + + + +directory = "CSV_Results" +# Get given CSV file directory +if(len(sys.argv) > 1): + directory = sys.argv[1] + +outputDir = "Graphs_" + directory.replace('/', '_') + +# Check if CSV directory exists +path = os.path.join(os.getcwd(), directory) +if(not os.path.isdir(path)): + print("ERROR - {} directory does not exist. 
Cannot generate any graphs.".format(directory)) + exit(1) + +# Get all filenames +path = os.path.join(os.getcwd(), directory) +filenames = os.listdir(path) + +# Make Graphs directory +graphDir = os.path.join(os.getcwd(), outputDir) +if(not os.path.isdir(graphDir)): + os.mkdir(graphDir) + +# ------------------------------ GEMV Graphs -------------------------------------------- +print("Creating GEMV graphs...") +# Create GEMV graphs +gemvFilenames = [] +for i in range(0, len(filenames)): + if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]: + gemvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in gemvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in gemvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemv" : + fp = "FP32" + elif kernel == "dgemv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, 
y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMDnV Graphs -------------------------------------------- +print("Creating SpMDnV graphs...") +# Create GEMV graphs +spmdnvFilenames = [] +for i in range(0, len(filenames)): + if "spmdnv_" in filenames[i]: + spmdnvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) # line[2] = M, line[3] = N + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in spmdnvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnv" : + fp = "FP32" + elif kernel 
== "dspmdnv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ GEMM Graphs -------------------------------------------- +print("Creating GEMM graphs...") +# Create GEMM graphs +gemmFilenames = [] +for i in range(0, len(filenames)): + if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]: + gemmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in gemmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in gemmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in gemmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in gemmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in gemmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in gemmFilenames[i]: + 
x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in gemmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in gemmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in gemmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemm" : + fp = "FP32" + elif kernel == "dgemm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpGEMM Graphs -------------------------------------------- +print("Creating SpMDnM graphs...") +# Create SpMDnM graphs +spmdnmFilenames = [] +for i in range(0, len(filenames)): + if "spmdnm_" in filenames[i]: + spmdnmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmdnmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]: + x_name 
= "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnm" : + fp = "FP32" + elif kernel == "dspmdnm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMDnM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMSpM Graphs -------------------------------------------- +print("Creating SpMSpM graphs...") +# Create SpMSpM graphs +spmspmFilenames = [] +for i in range(0, len(filenames)): + if "spmspm_" in filenames[i]: + spmspmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmspmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + prob_size = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + size = float(line[5].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + prob_size.append(size) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmspmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]: + 
x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmspm" : + fp = "FP32" + elif kernel == "dspmspm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMSpM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + if len(prob_size) > 0: + ax2 = ax1.twinx() + ax2.plot(xVals, prob_size, color="red", linestyle="--", marker="s", label="Problem Size (KiB)") + ax2.set_ylabel("Problem Size (KiB)", color="red", fontsize=14) + ax2.tick_params(axis='y', labelcolor="red") + ax2.set_ylim(min(prob_size) * 0.9, max(prob_size) * 1.1) + lines_1, labels_1 = ax1.get_legend_handles_labels() + lines_2, labels_2 = ax2.get_legend_handles_labels() + ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc="upper left") + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- diff --git a/createGflopsGraphs.py b/createGflopsGraphs.py index 0ed7772..8108812 100644 --- a/createGflopsGraphs.py +++ b/createGflopsGraphs.py @@ -26,12 +26,346 @@ if(not os.path.isdir(graphDir)): os.mkdir(graphDir) +# ------------------------------ GEMV Graphs -------------------------------------------- +print("Creating GEMV graphs...") +# Create GEMV graphs +gemvFilenames = [] +for i in range(0, len(filenames)): + if "gemv_" in filenames[i] and "spgemv_" not in filenames[i]: + gemvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(gemvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in gemvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in gemvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # 
Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sgemv" : + fp = "FP32" + elif kernel == "dgemv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMDnV Graphs -------------------------------------------- +print("Creating SpMDnV graphs...") +# Create GEMV graphs +spmdnvFilenames = [] +for i in range(0, len(filenames)): + if "spmdnv_" in filenames[i]: + spmdnvFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmdnvFilenames)): + mn = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmdnvFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MN + if (len(mn) == 0) or ([line[2], line[3]] not in mn): + mn.append([line[2], line[3]]) # line[2] = M, line[3] = N + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + + + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_vector_M=N" in spmdnvFilenames[i]: + x_name = "Value of M, N" + inputTypeStr = "Square x Vector (M=N)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_tall-thin_vector_M=16N" in spmdnvFilenames[i]: + x_name = "Value of N where M=16N" + inputTypeStr = "Tall-Thin x Vector (M=16N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + elif "_tall-thin_vector_M_N=32" in spmdnvFilenames[i]: + x_name = "Value of M, where N=32" + inputTypeStr = "Tall-Thin x Vector (M, N=32)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_N=16M" in spmdnvFilenames[i]: + x_name = "Value of M, where N=16M" + inputTypeStr = "Short-Wide x Vector (N=16M)" + for j in range(0, len(mn)): + xVals.append(mn[j][0]) + elif "_short-wide_vector_M=32_N" in spmdnvFilenames[i]: + x_name = "Value of N, where M=32" + inputTypeStr = "Short-Wide x Vector (M=32, N)" + for j in range(0, len(mn)): + xVals.append(mn[j][1]) + else: + # File not supported so go to next file + continue + + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnv" : + fp = "FP32" + elif kernel == "dspmdnv": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = 
"{}SpMDnV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnvFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + # ------------------------------ GEMM Graphs -------------------------------------------- print("Creating GEMM graphs...") # Create GEMM graphs gemmFilenames = [] for i in range(0, len(filenames)): - if "gemm_" in filenames[i]: + if "gemm_" in filenames[i] and "spgemm_" not in filenames[i]: gemmFilenames.append(filenames[i]) ### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s @@ -54,7 +388,8 @@ # Get number of iterations performed and kernel name line1 = lines[0].split(',') - iters = int(line1[6]) + sparsity = float(line1[6]) + iters = int(line1[7]) kernel = line1[1] # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types @@ -127,8 +462,6 @@ # File not supported so go to next file continue - - # Create y-axis label & graph title y_name = "" title = "" @@ -138,7 +471,9 @@ elif kernel == "dgemm": fp = "FP64" y_name = "{} GFLOP/s".format(fp) - title = "{}GEMM Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + title = ("{}GEMM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) # Make Graph fig1 = plt.figure(figsize=(28,16)) @@ -199,31 +534,32 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) plt.xlabel(x_name, fontsize=20) plt.ylabel(y_name, fontsize=20) plt.title(title, fontsize=20) - plt.savefig(fname="{}/{}.png".format(graphDir, gemmFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight") + plt.savefig(fname="{}/{}.pdf".format(graphDir, gemmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") plt.close('all') + print("\tPDF made") print("Finished!") # --------------------------------------------------------------------------------------- -# ------------------------------ GEMV Graphs -------------------------------------------- -print("Creating GEMV graphs...") -# Create GEMV graphs -gemvFilenames = [] +# ------------------------------ SpGEMM Graphs -------------------------------------------- +print("Creating SpMDnM graphs...") +# Create SpMDnM graphs +spmdnmFilenames = [] for i in range(0, len(filenames)): - if "gemv_" in filenames[i]: - gemvFilenames.append(filenames[i]) + if "spmdnm_" in filenames[i]: + spmdnmFilenames.append(filenames[i]) ### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s -for i in range(0, len(gemvFilenames)): - mn = [] +for i in range(0, len(spmdnmFilenames)): + mnk = [] iters = 0 kernel = "" cpu_Gflops = [] 
@@ -232,7 +568,7 @@ gpuU_Gflops = [] # Open file and get all lines - fName = os.path.join(os.getcwd(), directory, gemvFilenames[i]) + fName = os.path.join(os.getcwd(), directory, spmdnmFilenames[i]) openFile = open(fName, 'r') lines = openFile.readlines() lines.pop(0) # Remove headers @@ -241,15 +577,16 @@ # Get number of iterations performed and kernel name line1 = lines[0].split(',') - iters = int(line1[6]) + sparsity = float(line1[6]) + iters = int(line1[7]) kernel = line1[1] - # Get gflops (y-axis) and MN values (x-axis) for CPU and all GPU types + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types for line in lines: line = line.split(',') - # Get MN - if (len(mn) == 0) or ([line[2], line[3]] not in mn): - mn.append([line[2], line[3]]) + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) # Get Gflops gflops = float(line[-1].rstrip()) if line[0] == "cpu": @@ -261,52 +598,260 @@ elif line[0] == "gpu_unified": gpuU_Gflops.append(gflops) - # Create x-axis label and tick values inputTypeStr = "" x_name = "" xVals = [] - if "_square_vector_M=N" in gemvFilenames[i]: - x_name = "Value of M, N" - inputTypeStr = "Square x Vector (M=N)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_tall-thin_vector_M=16N" in gemvFilenames[i]: - x_name = "Value of N where M=16N" - inputTypeStr = "Tall-Thin x Vector (M=16N)" - for j in range(0, len(mn)): - xVals.append(mn[j][1]) - elif "_tall-thin_vector_M_N=32" in gemvFilenames[i]: - x_name = "Value of M, where N=32" - inputTypeStr = "Tall-Thin x Vector (M, N=32)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_short-wide_vector_N=16M" in gemvFilenames[i]: - x_name = "Value of M, where N=16M" - inputTypeStr = "Short-Wide x Vector (N=16M)" - for j in range(0, len(mn)): - xVals.append(mn[j][0]) - elif "_short-wide_vector_M=32_N" in gemvFilenames[i]: - x_name = "Value of N, where M=32" - inputTypeStr = "Short-Wide x Vector (M=32, N)" - for j in range(0, len(mn)): - xVals.append(mn[j][1]) + if "_square_square_M=N=K" in spmdnmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmdnmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmdnmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmdnmFilenames[i]: + x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square (N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmdnmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif 
"_square_short-wide_M=K_N=16K" in spmdnmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmdnmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) else: # File not supported so go to next file continue + # Create y-axis label & graph title + y_name = "" + title = "" + fp = "" + if kernel == "sspmdnm" : + fp = "FP32" + elif kernel == "dspmdnm": + fp = "FP64" + y_name = "{} GFLOP/s".format(fp) + title = ("{}SpMDnM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) + + # Make Graph + fig1 = plt.figure(figsize=(28,16)) + ax1 = fig1.add_subplot() + + gpuEnabled = False + if len(cpu_Gflops) > 0: + ax1.plot(xVals, cpu_Gflops, color="#332288", marker=".", label="CPU") + # Plot line at max GFLOP/s + yCoord = round(max(cpu_Gflops),1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max CPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + if len(gpuO_Gflops) > 0: + ax1.plot(xVals, gpuO_Gflops, color="#44AA99", marker="x", label="GPU (Offload Once)") + gpuEnabled = True + if len(gpuA_Gflops) > 0: + ax1.plot(xVals, gpuA_Gflops, color="#CC6677", marker="+", label="GPU (Offload Always)") + gpuEnabled = True + if len(gpuU_Gflops) > 0: + ax1.plot(xVals, gpuU_Gflops, color="#DDCC77", marker=">", label="GPU (Unified Memory)") + gpuEnabled = True + + if(gpuEnabled): + yCoord = round(max([max(gpuO_Gflops), max(gpuA_Gflops), max(gpuU_Gflops)]) ,1) + ax1.axhline(yCoord, color='black', linestyle='--') + ax1.text(x=0, y=yCoord, s="Max GPU GFLOP/s : {:,}".format(yCoord), fontsize=12, ha='left', va='bottom') + + # Set X ticks + NUM_TICK = 8 + numXVals = len(xVals) + if numXVals < NUM_TICK: + # Print all labels + plt.xticks(ticks=range(0, numXVals, 1), labels=xVals, fontsize=20) + else: + # Calculate labels + locInterval = int((numXVals) / (NUM_TICK-1)) + tickLocs = [0] + for q in range(1, (NUM_TICK-1)): + tickLocs.append(1 + (locInterval * q)) + tickLocs.append(numXVals - 1) + + labelInterval = int((int(xVals[-1]) - int(xVals[0])) / (NUM_TICK-1)) + tickLabs = [xVals[0]] + for q in range(1, (NUM_TICK-1)): + tickLabs.append(int(xVals[0]) + (labelInterval * q)) + tickLabs.append(int(xVals[-1])) + + plt.xticks(ticks=tickLocs, labels=tickLabs, fontsize=20) + + # Force setting of y-axis labels. If this isn't done then the range is weird... 
+ yLoc, yLab = plt.yticks() + yLoc = yLoc.tolist() + # Remove negative first element of the list + if yLoc[0] != 0: + yLoc = yLoc[1:] + plt.ylim(0, yLoc[-1]) + plt.yticks(ticks=yLoc, fontsize=20) + + plt.margins(x=0.01, y=0.01) + leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) + for obj in leg.legend_handles: + obj.set_linewidth(3.0) + obj.set_markersize(15.0) + obj.set_markeredgewidth(3.0) + plt.xlabel(x_name, fontsize=20) + plt.ylabel(y_name, fontsize=20) + plt.title(title, fontsize=20) + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmdnmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") + plt.close('all') + print("\tPDF made") + + +print("Finished!") +# --------------------------------------------------------------------------------------- + +# ------------------------------ SpMSpM Graphs -------------------------------------------- +print("Creating SpMSpM graphs...") +# Create SpMSpM graphs +spmspmFilenames = [] +for i in range(0, len(filenames)): + if "spmspm_" in filenames[i]: + spmspmFilenames.append(filenames[i]) + +### CSV header format ==== Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total Seconds,GFLOP/s +for i in range(0, len(spmspmFilenames)): + mnk = [] + iters = 0 + kernel = "" + cpu_Gflops = [] + gpuO_Gflops = [] + gpuA_Gflops = [] + gpuU_Gflops = [] + + # Open file and get all lines + fName = os.path.join(os.getcwd(), directory, spmspmFilenames[i]) + openFile = open(fName, 'r') + lines = openFile.readlines() + lines.pop(0) # Remove headers + if len(lines) == 0 : + continue + + # Get number of iterations performed and kernel name + line1 = lines[0].split(',') + sparsity = float(line1[6]) + iters = int(line1[7]) + kernel = line1[1] + + # Get gflops (y-axis) and MNK values (x-axis) for CPU and all GPU types + for line in lines: + line = line.split(',') + # Get MNK + if (len(mnk) == 0) or ([line[2], line[3], line[4]] not in mnk): + mnk.append([line[2], line[3], line[4]]) + # Get Gflops + gflops = float(line[-1].rstrip()) + if line[0] == "cpu": + cpu_Gflops.append(gflops) + elif line[0] == "gpu_offloadOnce": + gpuO_Gflops.append(gflops) + elif line[0] == "gpu_offloadAlways": + gpuA_Gflops.append(gflops) + elif line[0] == "gpu_unified": + gpuU_Gflops.append(gflops) + # Create x-axis label and tick values + inputTypeStr = "" + x_name = "" + xVals = [] + if "_square_square_M=N=K" in spmspmFilenames[i]: + x_name = "Value of M, N, K" + inputTypeStr = "Square x Square (M=N=K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_tall-thin_short-wide_M=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of K where M=16K and N=16K" + inputTypeStr = "Tall-Thin x Short-Wide (M=N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_short-wide_M=N_K=32" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=32" + inputTypeStr = "Tall-Thin x Short-Wide (M=N, K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N_K=16M" in spmspmFilenames[i]: + x_name = "Value of M and N, where K=16M" + inputTypeStr = "Short-Wide x Tall-Thin (M=N, K=16M)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_short-wide_tall-thin_M=N=32_K" in spmspmFilenames[i]: + x_name = "Value of K, where M=32 and N=32" + inputTypeStr = "Short-Wide x Tall-Thin (M=N=32, K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N_M=16K" in spmspmFilenames[i]: + x_name = "Value of N and K, where M=16K" + inputTypeStr = "Tall-Thin x Square 
(N=K, M=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][2]) + elif "_tall-thin_square_K=N=32_M" in spmspmFilenames[i]: + x_name = "Value of M, where N=32 and K=32" + inputTypeStr = "Tall-Thin x Square (M, N=K=32)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K_N=16K" in spmspmFilenames[i]: + x_name = "Value of M and K, where N=16K" + inputTypeStr = "Square x Short-Wide (M=K, N=16K)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][0]) + elif "_square_short-wide_M=K=32_N" in spmspmFilenames[i]: + x_name = "Value of N, where M=32 and K=32" + inputTypeStr = "Square x Short-Wide (M=K=32, N)" + for j in range(0, len(mnk)): + xVals.append(mnk[j][1]) + else: + # File not supported so go to next file + continue # Create y-axis label & graph title y_name = "" title = "" fp = "" - if kernel == "sgemv" : + if kernel == "sspmspm" : fp = "FP32" - elif kernel == "dgemv": + elif kernel == "dspmspm": fp = "FP64" y_name = "{} GFLOP/s".format(fp) - title = "{}GEMV Performance for {} Problems - {} iterations per problem size".format(kernel[0].upper(), inputTypeStr, iters) + title = ("{}SpMSpM Performance for {} Problems (sparsity = {})- {} " + "iterations per problemize").format(kernel[0].upper(), + inputTypeStr, sparsity, iters) # Make Graph fig1 = plt.figure(figsize=(28,16)) @@ -367,16 +912,17 @@ plt.margins(x=0.01, y=0.01) leg = plt.legend(loc='upper left', fancybox=True, ncol = 2, fontsize=18) - for obj in leg.legendHandles: + for obj in leg.legend_handles: obj.set_linewidth(3.0) obj.set_markersize(15.0) obj.set_markeredgewidth(3.0) plt.xlabel(x_name, fontsize=20) plt.ylabel(y_name, fontsize=20) plt.title(title, fontsize=20) - plt.savefig(fname="{}/{}.png".format(graphDir, gemvFilenames[i][:-4]), format="png", dpi=100, bbox_inches="tight") + plt.savefig(fname="{}/{}.pdf".format(graphDir, spmspmFilenames[i][:-4]), format="pdf", dpi=1000, bbox_inches="tight") plt.close('all') + print("\tPDF made") print("Finished!") -# --------------------------------------------------------------------------------------- \ No newline at end of file +# --------------------------------------------------------------------------------------- diff --git a/cuBLAS/common.hh b/cuBLAS/common.hh index 78d0270..af222fb 100644 --- a/cuBLAS/common.hh +++ b/cuBLAS/common.hh @@ -2,24 +2,45 @@ #if defined GPU_CUBLAS +#include +#include +#include + +/** Macro function to check if error occurred when calling cuBLAS. */ /** Macro function to check if error occurred when calling CUDA. */ -#define cudaCheckError(f) \ - do { \ - if (cudaError_t e = (f); e != cudaSuccess) { \ - std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cudaGetErrorString(e) << std::endl; \ - exit(1); \ - } \ +#define cudaCheckError(f) \ + do { \ + if (cudaError_t e = (f); e != cudaSuccess) { \ + std::cout << "CUDA error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cudaGetErrorString(e) << std::endl; \ + exit(1); \ + } \ } while (false) /** Macro function to check if error occurred when calling cuBLAS. 
*/ -#define cublasCheckError(f) \ - do { \ - if (cublasStatus_t e = (f); e != CUBLAS_STATUS_SUCCESS) { \ - std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": " \ - << cublasGetStatusString(e) << std::endl; \ - exit(1); \ - } \ +#define cublasCheckError(f) \ + do { \ + cublasStatus_t status = (f); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::cout << "CUBLAS error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cublasGetStatusName(status) << " - "; \ + std::cout << cublasGetStatusString(status) << std::endl; \ + exit(1); \ + } \ } while (false) -#endif \ No newline at end of file +/** Macro function to check if error occurred when calling cuSPARSE. */ +#define cusparseCheckError(f) \ + do { \ + cusparseStatus_t status = (f); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + std::cout << "CUSPARSE error: " << __FILE__ << ":" << __LINE__ << ": "; \ + std::cout << cusparseGetErrorName(status) << " - "; \ + std::cout << cusparseGetErrorString(status) << std::endl; \ + exit(1); \ + } \ + } while (false) \ + +#endif + + diff --git a/cuBLAS/spmdnm.hh b/cuBLAS/spmdnm.hh new file mode 100644 index 0000000..bb08d90 --- /dev/null +++ b/cuBLAS/spmdnm.hh @@ -0,0 +1,552 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmdnm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { + /** + * A class for sparse matrix-dense matrix BLAS + */ +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::nnz_; + using spmdnm::sparsity_; + using spmdnm::type_; + + ~spmdnm_gpu() { + if (alreadyInitialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(stream1_)); + cudaCheckError(cudaStreamDestroy(stream2_)); + cudaCheckError(cudaStreamDestroy(stream3_)); + cudaCheckError(cudaStreamDestroy(stream4_)); + cudaCheckError(cudaStreamDestroy(stream5_)); + + alreadyInitialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, bool binary = false) override { + if (!alreadyInitialised_) { + alreadyInitialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&stream1_)); + cudaCheckError(cudaStreamCreate(&stream2_)); + cudaCheckError(cudaStreamCreate(&stream3_)); + cudaCheckError(cudaStreamCreate(&stream4_)); + cudaCheckError(cudaStreamCreate(&stream5_)); + + cusparseCheckError(cusparseSetStream(handle_, stream1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + + } + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + m_ = m; + n_ = n; + k_ = k; + + B_ = C_ = B_dev_ = C_dev_ = A_vals_ = A_vals_dev_ = nullptr; + A_rows_ = A_cols_ = A_rows_dev_ = A_cols_dev_ = nullptr; + /** Determine the number of nnz elements in A and B */ + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + // Set up cuSPARSE metadata + opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPMM_ALG_DEFAULT; + index_ = CUSPARSE_INDEX_64I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + B_order_ = CUSPARSE_ORDER_ROW; + C_order_ = CUSPARSE_ORDER_ROW; + if (std::is_same_v) { + dataType_ = CUDA_R_32F; + } else if (std::is_same_v) { + dataType_ = CUDA_R_64F; + } else { + std::cerr << 
"INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&B_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + B_ = (T*)malloc(sizeof(T) * k_ * n_); + C_ = (T*)malloc(sizeof(T) * m_ * n_); + + cudaCheckError(cudaMalloc((void**)&B_dev_, sizeof(T) * k_ * n_)); + cudaCheckError(cudaMalloc((void**)&C_dev_, sizeof(T) * m_ * n_)); + } + cudaCheckError(cudaDeviceSynchronize()); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * nnz_); + A_cols_store_ = (int64_t*)malloc(sizeof(int64_t) * nnz_); + A_rows_store_ = (int64_t*)malloc(sizeof(int64_t) * (m_ + 1)); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else { + exit(1); + } + } + + // Allocate CSR arrays + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, nnz_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&A_cols_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMallocManaged(&A_rows_, (m_ + 1) * sizeof(int64_t))); + } else { + A_vals_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, nnz_ * sizeof(T))); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, (m_ + 1) * sizeof(int64_t))); + } + cudaCheckError(cudaDeviceSynchronize()); + + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + cudaCheckError(cudaDeviceSynchronize()); + } + +private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_)); + cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_)); + cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_)); + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(A_vals_, nnz_ * sizeof(T), gpuDevice_, stream1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, nnz_ * sizeof(int64_t), gpuDevice_, stream2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, (m_ + 1) * sizeof(int64_t), gpuDevice_, stream3_)); + cudaCheckError(cudaMemPrefetchAsync(B_, (n_ * k_) * sizeof(T), gpuDevice_, stream4_)); + cudaCheckError(cudaMemPrefetchAsync(C_, (m_ * n_) * sizeof(T), gpuDevice_, stream5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + void callSpmdnm() override { + switch(offload_) { + case gpuOffloadType::always: { + // Move over data + 
cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, stream1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, stream2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, stream3_)); + cudaCheckError(cudaMemcpyAsync(B_dev_, B_, (k_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream4_)); + cudaCheckError(cudaMemcpyAsync(C_dev_, C_, (m_ * n_) * sizeof(T), cudaMemcpyHostToDevice, stream5_)); + + // Set up descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_dev_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_dev_, + dataType_, + C_order_)); + + // Set up temporary buffers + void* dBuffer = nullptr; + size_t bufferSize = 0; + + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + + // Move result back to CPU + cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_), + cudaMemcpyDeviceToHost, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Set up descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_dev_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_dev_, + dataType_, + C_order_)); + + size_t bufferSize = 0; + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + void* dBuffer = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + } + case gpuOffloadType::unified: { + // Create 
descriptors for the matrices + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnMat(&B_descr_, + k_, + n_, + n_, + B_, + dataType_, + B_order_)); + cusparseCheckError(cusparseCreateDnMat(&C_descr_, + m_, + n_, + n_, + C_, + dataType_, + C_order_)); + + // Set up temporary buffers + void* dBuffer = nullptr; + size_t bufferSize = 0; + + // Begin matrix-matrix multiplication + cusparseCheckError(cusparseSpMM_bufferSize(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + &bufferSize)); + + // Allocate the temporary buffer + cudaCheckError(cudaMalloc((void**)&dBuffer, bufferSize)); + + cusparseCheckError(cusparseSpMM_preprocess(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMM(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + // Clean up descriptors + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnMat(B_descr_)); + cusparseCheckError(cusparseDestroyDnMat(C_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Free up the temporary buffer + cudaCheckError(cudaFree(dBuffer)); + break; + } + } + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to CPU + cudaCheckError(cudaMemcpyAsync(C_, C_dev_, (sizeof(T) * m_ * n_), + cudaMemcpyDeviceToHost, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Move result back to CPU + cudaCheckError(cudaMemPrefetchAsync(C_, sizeof(T) * m_ * n_, + cudaCpuDeviceId, stream1_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(B_)); + cudaCheckError(cudaFree(C_)); + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_); + free(C_); + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_dev_)); + cudaCheckError(cudaFree(C_dev_)); + } + } + + bool alreadyInitialised_ = false; + + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. */ + cudaStream_t stream1_; + cudaStream_t stream2_; + cudaStream_t stream3_; + cudaStream_t stream4_; + cudaStream_t stream5_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; + + // cuSPARSE parameters + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpMMAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + cudaDataType_t dataType_; + + /** + * ___________ Host data ______________ + */ + /** CSR format vectors for matrix A */ + cusparseSpMatDescr_t A_descr_; + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + int64_t A_num_rows_; + int64_t A_num_cols_; + + /** dense format values for matrices B and C */ + cusparseDnMatDescr_t B_descr_; + int64_t B_num_rows_; + int64_t B_num_cols_; + int64_t B_leading_dim_; + cusparseOrder_t B_order_; + + cusparseDnMatDescr_t C_descr_; + int64_t C_num_rows_; + int64_t C_num_cols_; + int64_t C_leading_dim_; + cusparseOrder_t C_order_; + + /** + * _____________ Device data ________________ + */ + T* A_vals_dev_; + int64_t* A_cols_dev_; + int64_t* A_rows_dev_; + + T* B_dev_; + + T* C_dev_; + + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; +}; +}; + + +#endif diff --git a/cuBLAS/spmdnv.hh b/cuBLAS/spmdnv.hh new file mode 100644 index 0000000..4d0317e --- /dev/null +++ b/cuBLAS/spmdnv.hh @@ -0,0 +1,484 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmdnv.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for SpMDnV GPU BLAS kernels. */ +template +class spmdnv_gpu : public spmdnv { + public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + using spmdnv::type_; + + ~spmdnv_gpu() { + if (initialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + cudaCheckError(cudaStreamDestroy(s4_)); + cudaCheckError(cudaStreamDestroy(s5_)); + + initialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, + double sparsity, matrixType type) override { + if (!initialised_) { + initialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + cudaCheckError(cudaStreamCreate(&s4_)); + cudaCheckError(cudaStreamCreate(&s5_)); + + cusparseCheckError(cusparseSetStream(handle_, s1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + } + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + + // Setting cusparse metadata + if (std::is_same_v) { + dataType_ = CUDA_R_32F; + } else if (std::is_same_v) { + dataType_ = CUDA_R_64F; + } else { + std::cerr << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + opA_ = opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPMV_ALG_DEFAULT; + index_ = CUSPARSE_INDEX_64I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + + m_ = m; + n_ = n; + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + // Allocate dense data structures + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&x_, n_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&y_, m_ * sizeof(T))); + cudaCheckError(cudaDeviceSynchronize()); + } else { + x_ = (T*)malloc(n_ * sizeof(T)); + y_ = (T*)malloc(m_ * sizeof(T)); + + cudaCheckError(cudaMalloc((void**)&x_dev_, n_ * sizeof(T))); + 
cudaCheckError(cudaMalloc((void**)&y_dev_, m_ * sizeof(T))); + cudaCheckError(cudaDeviceSynchronize()); + } + + initInputMatrixVector(); + } + +protected: + + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * nnz_); + A_cols_store_ = (int64_t*)malloc(sizeof(int64_t) * nnz_); + A_rows_store_ = (int64_t*)malloc(sizeof(int64_t) * (m_ + 1)); + + if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } + + + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, nnz_ * sizeof(T))); + cudaCheckError(cudaMallocManaged(&A_cols_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMallocManaged(&A_rows_, (m_ + 1) * sizeof(int64_t))); + } else { + A_vals_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, nnz_ * sizeof(T))); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, nnz_ * sizeof(int64_t))); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, (m_ + 1) * sizeof(int64_t))); + } + cudaCheckError(cudaDeviceSynchronize()); + + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + cudaCheckError(cudaDeviceSynchronize()); + } + + private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(x_dev_, x_, n_ * sizeof(T), cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(y_dev_, y_, m_ * sizeof(T), cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_vals_, nnz_ * sizeof(T), gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, nnz_ * sizeof(int64_t), gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, (m_ + 1) * sizeof(int64_t), gpuDevice_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(x_, n_ * sizeof(T), gpuDevice_, s4_)); + cudaCheckError(cudaMemPrefetchAsync(y_, m_ * sizeof(T), gpuDevice_, s5_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. 
*/ + void callSpMDnV() override { + switch(offload_) { + case gpuOffloadType::always: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, nnz_ * sizeof(T), cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, nnz_ * sizeof(int64_t), cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, (m_ + 1) * sizeof(int64_t), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(x_dev_, x_, n_ * sizeof(T), cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(y_dev_, y_, m_ * sizeof(T), cudaMemcpyHostToDevice, s5_)); + + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_dev_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_dev_, + dataType_)); + cudaCheckError(cudaDeviceSynchronize()); + + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + + cudaCheckError(cudaMemcpyAsync(y_, y_dev_, m_ * sizeof(T), cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_dev_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_dev_, + dataType_)); + cudaCheckError(cudaDeviceSynchronize()); + + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&x_descr_, + n_, + x_, + dataType_)); + cusparseCheckError(cusparseCreateDnVec(&y_descr_, + m_, + y_, + dataType_)); + 
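+        // Expanding on the workflow note below: cusparseSpMV computes
+        //   y = alpha * op(A) * x + beta * y,
+        // with alpha/beta taken from the benchmark's ALPHA/BETA constants. The
+        // call pair used in every branch of this method is (illustrative names):
+        //   size_t bytes = 0;  void* work = nullptr;
+        //   cusparseSpMV_bufferSize(handle, opA, &alpha, A, x, &beta, y,
+        //                           dataType, CUSPARSE_SPMV_ALG_DEFAULT, &bytes);
+        //   if (bytes > 0) cudaMalloc(&work, bytes);
+        //   cusparseSpMV(handle, opA, &alpha, A, x, &beta, y,
+        //                dataType, CUSPARSE_SPMV_ALG_DEFAULT, work);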
cudaCheckError(cudaDeviceSynchronize()); + /* + * Workflow is : + * cusparseSpMV_bufferSize + * cusparseSpMV + */ + size_t bufferSize; + void* dBuffer = nullptr; + cusparseCheckError(cusparseSpMV_bufferSize(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + &bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + // TODO -- cusparseSpMV_preprocess() + + if (bufferSize > 0) cudaCheckError(cudaMalloc(&dBuffer, bufferSize)); + cudaCheckError(cudaDeviceSynchronize()); + + cusparseCheckError(cusparseSpMV(handle_, + opA_, + &alpha, + A_descr_, + x_descr_, + &beta, + y_descr_, + dataType_, + alg_, + dBuffer)); + + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroyDnVec(x_descr_)); + cusparseCheckError(cusparseDestroyDnVec(y_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + if (dBuffer != nullptr) cudaCheckError(cudaFree(dBuffer)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } + } + } + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + void postLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(y_, y_dev_, sizeof(T) * m_, cudaMemcpyDeviceToHost, s3_)); + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(y_, m_ * sizeof(T), cudaCpuDeviceId, s3_)); + break; + } + } + cudaCheckError(cudaDeviceSynchronize()); + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(x_)); + cudaCheckError(cudaFree(y_)); + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(x_); + free(y_); + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(x_dev_)); + cudaCheckError(cudaFree(y_dev_)); + } + } + + bool initialised_ = false; + + /** + * ################################ + * CUSPARSE STUFF + * ################################ + */ + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. + */ + cudaStream_t s1_; + cudaStream_t s2_; + cudaStream_t s3_; + cudaStream_t s4_; + cudaStream_t s5_; + + /** The ID of the target GPU Device. */ + int gpuDevice_; + + // Create descriptors for matrices A->C + cusparseSpMatDescr_t A_descr_; + cusparseDnVecDescr_t x_descr_, y_descr_; + + // cusparse metadata variables + cudaDataType_t dataType_; + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpMVAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. 
*/ + const T beta = BETA; + + /** + * ################################ + * Matrix A parameters + * ################################ + */ + /** CSR format vectors for storage of matrix between offload type runs */ + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; + + /** CSR format vectors on the host (also used for USM) */ + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + /** CSR format vectors on the device. */ + T* A_vals_dev_; + int64_t* A_cols_dev_; + int64_t* A_rows_dev_; + + /** + * ################################ + * Vectors x and y parameters + * ################################ + */ + /** Vectors on the host (also used for USM) */ + T* x_host_; + T* y_host_; + /** Vectors on the device */ + T* x_dev_; + T* y_dev_; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/cuBLAS/spmspm.hh b/cuBLAS/spmspm.hh new file mode 100644 index 0000000..5ec847c --- /dev/null +++ b/cuBLAS/spmspm.hh @@ -0,0 +1,907 @@ +#pragma once + +#ifdef GPU_CUBLAS +#include +#include +#include +#include +#include + +#include "../include/kernels/GPU/spmspm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +/** A class for sparse GEMM GPU BLAS kernels. */ +template +class spmspm_gpu : public spmspm { + public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::C_nnz_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + + ~spmspm_gpu() { + if (alreadyInitialised_) { + cusparseCheckError(cusparseDestroy(handle_)); + + cudaCheckError(cudaStreamDestroy(s1_)); + cudaCheckError(cudaStreamDestroy(s2_)); + cudaCheckError(cudaStreamDestroy(s3_)); + cudaCheckError(cudaStreamDestroy(s4_)); + cudaCheckError(cudaStreamDestroy(s5_)); + cudaCheckError(cudaStreamDestroy(s6_)); + + alreadyInitialised_ = false; + } + } + + /** Initialise the required data structures. 
+ * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + void initialise(gpuOffloadType offload, int n, int m, int k, + double sparsity, matrixType type, + bool binary = false) override { + if (!alreadyInitialised_) { + alreadyInitialised_ = true; + cusparseCheckError(cusparseCreate(&handle_)); + + cudaCheckError(cudaStreamCreate(&s1_)); + cudaCheckError(cudaStreamCreate(&s2_)); + cudaCheckError(cudaStreamCreate(&s3_)); + cudaCheckError(cudaStreamCreate(&s4_)); + cudaCheckError(cudaStreamCreate(&s5_)); + cudaCheckError(cudaStreamCreate(&s6_)); + + cusparseCheckError(cusparseSetStream(handle_, s1_)); + + // Get device identifier + cudaCheckError(cudaGetDevice(&gpuDevice_)); + } + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + + m_ = m; + n_ = n; + k_ = k; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + opA_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + opB_ = CUSPARSE_OPERATION_NON_TRANSPOSE; + alg_ = CUSPARSE_SPGEMM_ALG2; + index_ = CUSPARSE_INDEX_32I; + base_ = CUSPARSE_INDEX_BASE_ZERO; + if (std::is_same_v) dataType_ = CUDA_R_32F; + else if (std::is_same_v) dataType_ = CUDA_R_64F; + else { + std::cerr << "INVALID DATA TYPE PASSED TO cuSPARSE" << std::endl; + exit(1); + } + + initInputMatrices(); + } + + protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(sizeof(T) * A_nnz_); + A_cols_store_ = (int32_t*)malloc(sizeof(int32_t) * A_nnz_); + A_rows_store_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + B_vals_store_ = (T*)malloc(sizeof(T) * B_nnz_); + B_cols_store_ = (int32_t*)malloc(sizeof(int32_t) * B_nnz_); + B_rows_store_ = (int32_t*)malloc(sizeof(int32_t) * (k_ + 1)); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_store_, A_cols_store_, k_, B_nnz_, B_rows_store_, B_cols_store_) == 0); + } + + // Allocate CSR arrays + if (offload_ == gpuOffloadType::unified) { + cudaCheckError(cudaMallocManaged(&A_vals_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_cols_, sizeof(int32_t) * A_nnz_)); + cudaCheckError(cudaMallocManaged(&A_rows_, sizeof(int32_t) * (m_ + 1))); + cudaCheckError(cudaMallocManaged(&B_vals_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_cols_, sizeof(int32_t) * B_nnz_)); + cudaCheckError(cudaMallocManaged(&B_rows_, sizeof(int32_t) * (k_ + 
1))); + cudaCheckError(cudaMallocManaged(&C_rows_32_, sizeof(int32_t) * (m_ + 1))); + C_vals_ = nullptr; + C_cols_32_ = nullptr; + } else { + A_vals_ = (T*)malloc(sizeof(T) * A_nnz_); + A_cols_ = (int32_t*)malloc(sizeof(int32_t) * A_nnz_); + A_rows_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + B_vals_ = (T*)malloc(sizeof(T) * B_nnz_); + B_cols_ = (int32_t*)malloc(sizeof(int32_t) * B_nnz_); + B_rows_ = (int32_t*)malloc(sizeof(int32_t) * (k_ + 1)); + C_rows_32_ = (int32_t*)malloc(sizeof(int32_t) * (m_ + 1)); + C_vals_ = nullptr; + C_cols_32_ = nullptr; + + cudaCheckError(cudaMalloc((void**)&A_vals_dev_, sizeof(T) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_cols_dev_, sizeof(int32_t) * A_nnz_)); + cudaCheckError(cudaMalloc((void**)&A_rows_dev_, sizeof(int32_t) * (m_ + 1))); + cudaCheckError(cudaMalloc((void**)&B_vals_dev_, sizeof(T) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_cols_dev_, sizeof(int32_t) * B_nnz_)); + cudaCheckError(cudaMalloc((void**)&B_rows_dev_, sizeof(int32_t) * (k_ + 1))); + cudaCheckError(cudaMalloc((void**)&C_rows_dev_, sizeof(int32_t) * (m_ + 1))); + C_vals_dev_ = nullptr; + C_cols_dev_ = nullptr; + } + + // Move data into the correct arrays + memcpy(A_vals_, A_vals_store_, sizeof(T) * A_nnz_); + memcpy(A_cols_, A_cols_store_, sizeof(int32_t) * A_nnz_); + memcpy(A_rows_, A_rows_store_, sizeof(int32_t) * (m_ + 1)); + memcpy(B_vals_, B_vals_store_, sizeof(T) * B_nnz_); + memcpy(B_cols_, B_cols_store_, sizeof(int32_t) * B_nnz_); + memcpy(B_rows_, B_rows_store_, sizeof(int32_t) * (k_ + 1)); + } + + private: + /** Perform any required steps before calling the GEMM kernel that should + * be timed. */ + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, sizeof(int32_t) * A_nnz_, cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, sizeof(int32_t) * (m_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(B_vals_dev_, B_vals_, sizeof(T) * B_nnz_, cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(B_cols_dev_, B_cols_, sizeof(int32_t) * B_nnz_, cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaMemcpyAsync(B_rows_dev_, B_rows_, sizeof(int32_t) * (k_ + 1), cudaMemcpyHostToDevice, s6_)); + break; + } + case gpuOffloadType::unified: { + // Prefetch memory to device + cudaCheckError(cudaMemPrefetchAsync(A_vals_, sizeof(T) * A_nnz_, gpuDevice_, s1_)); + cudaCheckError(cudaMemPrefetchAsync(A_cols_, sizeof(int32_t) * A_nnz_, gpuDevice_, s2_)); + cudaCheckError(cudaMemPrefetchAsync(A_rows_, sizeof(int32_t) * (m_ + 1), gpuDevice_, s3_)); + cudaCheckError(cudaMemPrefetchAsync(B_vals_, sizeof(T) * B_nnz_, gpuDevice_, s4_)); + cudaCheckError(cudaMemPrefetchAsync(B_cols_, sizeof(int32_t) * B_nnz_, gpuDevice_, s5_)); + cudaCheckError(cudaMemPrefetchAsync(B_rows_, sizeof(int32_t) * (k_ + 1), gpuDevice_, s6_)); + break; + } + } + } + + /** Make a call to the BLAS Library Kernel. 
*/ + void callSpmspm() override { + switch(offload_) { + case gpuOffloadType::always: { + if (C_allocated) { + free(C_vals_); + free(C_cols_32_); + C_allocated = false; + } + + cudaCheckError(cudaMemcpyAsync(A_vals_dev_, A_vals_, sizeof(T) * A_nnz_, cudaMemcpyHostToDevice, s1_)); + cudaCheckError(cudaMemcpyAsync(A_cols_dev_, A_cols_, sizeof(int32_t) * A_nnz_, cudaMemcpyHostToDevice, s2_)); + cudaCheckError(cudaMemcpyAsync(A_rows_dev_, A_rows_, sizeof(int32_t) * (m_ + 1), cudaMemcpyHostToDevice, s3_)); + cudaCheckError(cudaMemcpyAsync(B_vals_dev_, B_vals_, sizeof(T) * B_nnz_, cudaMemcpyHostToDevice, s4_)); + cudaCheckError(cudaMemcpyAsync(B_cols_dev_, B_cols_, sizeof(int32_t) * B_nnz_, cudaMemcpyHostToDevice, s5_)); + cudaCheckError(cudaMemcpyAsync(B_rows_dev_, B_rows_, sizeof(int32_t) * (k_ + 1), cudaMemcpyHostToDevice, s6_)); + cudaCheckError(cudaDeviceSynchronize()); + + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_dev_, + B_cols_dev_, + B_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_vals_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_cols_dev_, sizeof(int32_t) * C_nnz_)); + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_dev_, + C_cols_dev_, + C_vals_dev_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + 
cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + + C_vals_ = (T*)malloc(sizeof(T) * C_nnz_); + C_cols_32_ = (int32_t*)malloc(sizeof(int32_t) * C_nnz_); + C_allocated = true; + + cudaCheckError(cudaMemcpyAsync(C_rows_32_, C_rows_dev_, sizeof(int32_t) * (m_ + 1), cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(C_cols_32_, C_cols_dev_, sizeof(int32_t) * C_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_vals_, C_vals_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + break; + } + case gpuOffloadType::once: { + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + C_allocated = false; + } + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_dev_, + A_cols_dev_, + A_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_dev_, + B_cols_dev_, + B_vals_dev_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMalloc(&C_vals_dev_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMalloc(&C_cols_dev_, sizeof(int32_t) * C_nnz_)); + C_allocated = true; + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_dev_, + C_cols_dev_, + C_vals_dev_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + 
C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + break; + } + case gpuOffloadType::unified: { + if (C_allocated) { + cudaCheckError(cudaFree(C_cols_32_)); + cudaCheckError(cudaFree(C_vals_)); + C_allocated = false; + } + + // Make matrix descriptors + cusparseCheckError(cusparseCreateCsr(&A_descr_, + m_, + k_, + A_nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&B_descr_, + k_, + n_, + B_nnz_, + B_rows_, + B_cols_, + B_vals_, + index_, + index_, + base_, + dataType_)); + cusparseCheckError(cusparseCreateCsr(&C_descr_, + m_, + n_, + 0, + nullptr, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_)); + + cusparseCheckError(cusparseSpGEMM_createDescr(&spgemmDescr_)); + + size_t bufferSize1 = 0; + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + nullptr)); + + void* dBuffer1 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer1, bufferSize1)); + + cusparseCheckError(cusparseSpGEMM_workEstimation(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize1, + dBuffer1)); + + int64_t num_prods; + cusparseCheckError(cusparseSpGEMM_getNumProducts(spgemmDescr_, &num_prods)); + + size_t bufferSize3 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + nullptr, + nullptr)); + + void* dBuffer3 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer3, bufferSize3)); + size_t bufferSize2 = 0; + cusparseCheckError(cusparseSpGEMM_estimateMemory(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + 0.2, + &bufferSize3, + dBuffer3, + &bufferSize2)); + + void* dBuffer2 = nullptr; + cudaCheckError(cudaMalloc((void**)&dBuffer2, bufferSize2)); + + cusparseCheckError(cusparseSpGEMM_compute(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_, + &bufferSize2, + dBuffer2)); + + cusparseCheckError(cusparseSpMatGetSize(C_descr_, + &C_num_rows_, + &C_num_cols_, + &C_nnz_)); + + cudaCheckError(cudaMallocManaged(&C_vals_, sizeof(T) * C_nnz_)); + cudaCheckError(cudaMallocManaged(&C_cols_32_, sizeof(int32_t) * C_nnz_)); + C_allocated = true; + + cusparseCheckError(cusparseCsrSetPointers(C_descr_, + C_rows_32_, + C_cols_32_, + C_vals_)); + + cusparseCheckError(cusparseSpGEMM_copy(handle_, + opA_, + opB_, + &alpha, + A_descr_, + B_descr_, + &beta, + C_descr_, + dataType_, + alg_, + spgemmDescr_)); + + // Freeing memory + cudaCheckError(cudaFree(dBuffer1)); + cudaCheckError(cudaFree(dBuffer2)); + cudaCheckError(cudaFree(dBuffer3)); + cusparseCheckError(cusparseSpGEMM_destroyDescr(spgemmDescr_)); + cusparseCheckError(cusparseDestroySpMat(A_descr_)); + cusparseCheckError(cusparseDestroySpMat(B_descr_)); + cusparseCheckError(cusparseDestroySpMat(C_descr_)); + cudaCheckError(cudaDeviceSynchronize()); + break; + } 
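+      // Each branch above follows the memory-constrained cusparseSpGEMM (ALG2)
+      // workflow: cusparseSpGEMM_workEstimation twice (size query, then with the
+      // buffer), cusparseSpGEMM_estimateMemory with a chunk fraction of 0.2,
+      // cusparseSpGEMM_compute, cusparseSpMatGetSize to discover C's nnz,
+      // allocation of C's value/column arrays, cusparseCsrSetPointers, and
+      // finally cusparseSpGEMM_copy to materialise C. C cannot be sized up
+      // front because its nnz is only known once the compute step finishes.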
+ } + } + + /** Perform any required steps after calling the GEMM kernel that should + * be timed. */ + void postLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + C_vals_ = (T*)malloc(sizeof(T) * C_nnz_); + C_cols_32_ = (int32_t*)malloc(sizeof(int32_t) * C_nnz_); + + cudaCheckError(cudaMemcpyAsync(C_rows_32_, C_rows_dev_, sizeof(int32_t) * (m_ + 1), cudaMemcpyDeviceToHost, s1_)); + cudaCheckError(cudaMemcpyAsync(C_cols_32_, C_cols_dev_, sizeof(int32_t) * C_nnz_, cudaMemcpyDeviceToHost, s2_)); + cudaCheckError(cudaMemcpyAsync(C_vals_, C_vals_dev_, sizeof(T) * C_nnz_, cudaMemcpyDeviceToHost, s3_)); + cudaCheckError(cudaDeviceSynchronize()); + + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_dev_)); + cudaCheckError(cudaFree(C_cols_dev_)); + C_allocated = false; + } + break; + } + case gpuOffloadType::unified: { + cudaCheckError(cudaMemPrefetchAsync(C_vals_, sizeof(T) * C_nnz_, cudaCpuDeviceId, s1_)); + cudaCheckError(cudaMemPrefetchAsync(C_cols_32_, sizeof(int32_t) * C_nnz_, cudaCpuDeviceId, s2_)); + cudaCheckError(cudaMemPrefetchAsync(C_rows_32_, sizeof(int32_t) * (m_ + 1), cudaCpuDeviceId, s3_)); + break; + } + } + } + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: { + if (C_allocated) { + free(C_vals_); + free(C_cols_32_); + C_allocated = false; + } + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_vals_); + free(B_cols_); + free(B_rows_); + free(C_rows_32_); + + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_vals_dev_)); + cudaCheckError(cudaFree(B_cols_dev_)); + cudaCheckError(cudaFree(B_rows_dev_)); + cudaCheckError(cudaFree(C_rows_dev_)); + break; + } + case gpuOffloadType::once: { + free(A_vals_); + free(A_cols_); + free(A_rows_); + free(B_vals_); + free(B_cols_); + free(B_rows_); + free(C_vals_); + free(C_cols_32_); + free(C_rows_32_); + + cudaCheckError(cudaFree(A_vals_dev_)); + cudaCheckError(cudaFree(A_cols_dev_)); + cudaCheckError(cudaFree(A_rows_dev_)); + cudaCheckError(cudaFree(B_vals_dev_)); + cudaCheckError(cudaFree(B_cols_dev_)); + cudaCheckError(cudaFree(B_rows_dev_)); + cudaCheckError(cudaFree(C_rows_dev_)); + break; + } + case gpuOffloadType::unified: { + if (C_allocated) { + cudaCheckError(cudaFree(C_vals_)); + cudaCheckError(cudaFree(C_cols_32_)); + C_allocated = false; + } + cudaCheckError(cudaFree(A_vals_)); + cudaCheckError(cudaFree(A_cols_)); + cudaCheckError(cudaFree(A_rows_)); + cudaCheckError(cudaFree(B_vals_)); + cudaCheckError(cudaFree(B_cols_)); + cudaCheckError(cudaFree(B_rows_)); + cudaCheckError(cudaFree(C_rows_32_)); + + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + free(B_vals_store_); + free(B_cols_store_); + free(B_rows_store_); + break; + } + } + } + + bool alreadyInitialised_ = false; + + /** Handle used when calling cuBLAS. */ + cusparseHandle_t handle_; + + /** CUDA Streams - used to asynchronously move data between host and device. */ + cudaStream_t s1_; + cudaStream_t s2_; + cudaStream_t s3_; + cudaStream_t s4_; + cudaStream_t s5_; + cudaStream_t s6_; + + /** The ID of the target GPU Device. 
*/ + int gpuDevice_; + + /** Storage for matrices between offload type calls */ + T* A_vals_store_; + int32_t* A_cols_store_; + int32_t* A_rows_store_; + T* B_vals_store_; + int32_t* B_cols_store_; + int32_t* B_rows_store_; + + /** CSR format vectors for matrices A, B and C on the host */ + T* A_vals_; + int32_t* A_cols_; + int32_t* A_rows_; + + T* B_vals_; + int32_t* B_cols_; + int32_t* B_rows_; + + int64_t C_num_rows_; + int64_t C_num_cols_; + + /** CSR format vectors for matrices A, B and C on the device. */ + T* A_vals_dev_; + T* B_vals_dev_; + T* C_vals_dev_; + int32_t* A_cols_dev_; + int32_t* A_rows_dev_; + int32_t* B_cols_dev_; + int32_t* B_rows_dev_; + int32_t* C_cols_dev_; + int32_t* C_rows_dev_; + + int32_t* C_cols_32_; + int32_t* C_rows_32_; + + bool C_allocated = false; + + /** The constant value Alpha. */ + const T alpha = ALPHA; + + /** The constant value Beta. */ + const T beta = BETA; + + // Create descriptors for matrices A->C + cusparseSpMatDescr_t A_descr_, B_descr_, C_descr_; + + cusparseSpGEMMDescr_t spgemmDescr_; + + cusparseOperation_t opA_; + cusparseOperation_t opB_; + cusparseSpGEMMAlg_t alg_; + cusparseIndexType_t index_; + cusparseIndexBase_t base_; + cudaDataType_t dataType_; +}; +} // namespace gpu +#endif \ No newline at end of file diff --git a/include/.DS_Store b/include/.DS_Store new file mode 100644 index 0000000..869e02c Binary files /dev/null and b/include/.DS_Store differ diff --git a/include/doGemm.hh b/include/doGemm.hh index c1aa742..55a6384 100644 --- a/include/doGemm.hh +++ b/include/doGemm.hh @@ -32,21 +32,22 @@ template class doGemm { public: doGemm(const std::string csvDir, const int iters, const int startDim, - const int upperLimit, const bool cpuEnabled = true, + const int upperLimit, const int step, const bool cpuEnabled = true, const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), + step_(step), doCPU_(cpuEnabled), doGPU_(gpuEnabled) #if CPU_ENABLED , - gemmCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemmGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -57,7 +58,7 @@ class doGemm { /** Run all problem types and write data to CSV files. */ void collectData() { // Square Problem Sizes... 
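+    // Note on the new step_ parameter: every sweep below now advances by
+    // step_ instead of 1, and the 16:1 rectangular sweeps scale their
+    // increments by 16 * step_ so the aspect ratio is preserved. A caller
+    // (values illustrative only) would now construct the class as:
+    //   doGemm<double> gemm("csv_out", /*iters=*/10, /*startDim=*/1,
+    //                       /*upperLimit=*/4096, /*step=*/32);
+    //   gemm.collectData();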
- // Re-initialise offload threshold structures & previous results + // Re-initialise offload threshold structures cpuGpu_always_ = cpuGpu_offloadThreshold(); cpuGpu_once_ = cpuGpu_offloadThreshold(); cpuGpu_unified_ = cpuGpu_offloadThreshold(); @@ -66,7 +67,7 @@ class doGemm { prev_gpuResult_unified = time_checksum_gflop(); std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_square_M=N=K.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = dim, K = dim; callKernels(csvFile, dim, dim, dim); } @@ -79,218 +80,217 @@ class doGemm { } #endif - // Rectangular Problem Sizes: - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_M=16K.csv"); - int K = startDimention_; - int M = 16 * K; - int N = 16 * K; - while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); - M += 16; - N += 16; - K++; - } - // Close file - csvFile.close(); + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } #endif - // Tall and thin x Short and wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_short-wide_M=N_K=32.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = dim, K = 32; - callKernels(csvFile, dim, dim, 32); - } + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + 
"_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N_K=16M.csv"); - M = startDimention_; - N = startDimention_; - K = 16 * M; - while (K <= upperLimit_) { - callKernels(csvFile, M, N, K); - M++; - N++; - K += 16; - } - // Close file - csvFile.close(); + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } #endif - // Short and wide x Tall and thin - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_short-wide_tall-thin_M=N=32_K.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = 32, K = dim; - callKernels(csvFile, 32, 32, dim); - } + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + 
callKernels(csvFile, 32, 32, dim); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N_M=16K.csv"); - K = startDimention_; - N = startDimention_; - M = 16 * K; - while (M <= upperLimit_) { - callKernels(csvFile, M, N, K); - M += 16; - N++; - K++; - } - // Close file - csvFile.close(); + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } #endif - // Tall and Thin x Square - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_tall-thin_square_K=N=32_M.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = dim, N = 32, K = 32; - callKernels(csvFile, dim, 32, 32); - } + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); } - // Close file - csvFile.close(); + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Tall-and-Thin x 
Square (M, K=N=32)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } #endif - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K_N=16K.csv"); - M = startDimention_; - K = startDimention_; - N = 16 * K; - while (N <= upperLimit_) { - callKernels(csvFile, M, N, K); - M++; - N += 16; - K++; - } - // Close file - csvFile.close(); + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } #endif - - // Square x Short and Wide - // Re-initialise offload threshold structures & previous results - cpuGpu_always_ = cpuGpu_offloadThreshold(); - cpuGpu_once_ = cpuGpu_offloadThreshold(); - cpuGpu_unified_ = cpuGpu_offloadThreshold(); - prev_gpuResult_always = time_checksum_gflop(); - prev_gpuResult_once = time_checksum_gflop(); - prev_gpuResult_unified = time_checksum_gflop(); - csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + - "_square_short-wide_M=K=32_N.csv"); - if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { - // M = 32, N = dim, K = 32; - callKernels(csvFile, 32, dim, 32); - } + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); } - // Close file - csvFile.close(); + } #if CPU_ENABLED && GPU_ENABLED - if (doCPU_ && doGPU_) { - // Print offload results to stdout - printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); - } + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } #endif + // Close file + csvFile.close(); } private: @@ -301,55 +301,54 @@ class doGemm { const uint64_t flops = calcFlops(M, N, K); 
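+    // For reference: a dense GEMM C = alpha*A*B + beta*C is conventionally
+    // counted as flops ~= 2 * M * N * K (one multiply and one add per term),
+    // with a working set of roughly (M*K + K*N + M*N) * sizeof(T) / 1024 KiB,
+    // which is what calcFlops()/calcKib() are assumed to implement here.
+    // The extra 0.0 passed to writeLineToCsv below fills the new sparsity
+    // column; it is always zero for the dense GEMM kernels.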
std::string kernelName = getKernelName(); - time_checksum_gflop cpuResult; - time_checksum_gflop gpuResult_once; - time_checksum_gflop gpuResult_always; - time_checksum_gflop gpuResult_unified; - // Perform CPU kernel #if CPU_ENABLED + time_checksum_gflop cpuResult; if (doCPU_) { - gemmCpu_.initialise(M, N, K); - cpuResult = gemmCpu_.compute(); + cpu_.initialise(M, N, K); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + 0.0, iterations_, cpuResult.runtime, cpuResult.gflops); } #endif // Perform the GPU kernels #if GPU_ENABLED + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemmGpu_.initialise(gpuOffloadType::once, M, N, K); - gpuResult_once = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N, K); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemmGpu_.initialise(gpuOffloadType::always, M, N, K); - gpuResult_always = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N, K); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemmGpu_.initialise(gpuOffloadType::unified, M, N, K); - gpuResult_unified = gemmGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N, K); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -386,8 +385,8 @@ class doGemm { void checkChecksums(time_checksum_gflop cpuResult, time_checksum_gflop gpuResult_once, time_checksum_gflop gpuResult_always, - time_checksum_gflop gpuResult_unified, const int M, - const int N, const int K) { + time_checksum_gflop gpuResult_unified, + const int M, const int N, const int K) { // Ensure that each checksum difference is less than 0.1% double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * @@ -396,21 +395,12 @@ class doGemm { hundredOverChecksum)) > 0.1 && ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * hundredOverChecksum)) > 0.1) { - std::cerr << "ERROR - " << getKernelName() - << " kernel checksums do not match:\n\tInput " - "dimensions: M=" - << M << ", N=" << N << ", K=" << K << std::endl; - std::cerr << std::setprecision(10) - << "\tCPU Checksum = " << cpuResult.checksum << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Once) Checksum = " << 
gpuResult_once.checksum - << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Always) Checksum = " << gpuResult_always.checksum - << std::endl; - std::cerr << std::setprecision(10) - << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum - << std::endl; + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; exit(1); } } @@ -524,7 +514,7 @@ class doGemm { } /** Print to stdout the offload thresholds. */ - void printOffloadThreshold(std::string problemName) const { + void printOffloadThreshold(const std::string& problemName) const { std::vector header = { "Device", "M", "N", "K", "Total Prob. Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; @@ -534,8 +524,7 @@ class doGemm { std::stringstream probSize_o; std::stringstream gpuGflops_o; std::stringstream cpuGflops_o; - probSize_o << std::fixed << std::setprecision(2) - << cpuGpu_once_.probSize_kib; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; if (cpuGpu_once_.M == 0) { @@ -554,12 +543,9 @@ class doGemm { std::stringstream probSize_a; std::stringstream gpuGflops_a; std::stringstream cpuGflops_a; - probSize_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.probSize_kib; - gpuGflops_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.gpuGflops; - cpuGflops_a << std::fixed << std::setprecision(2) - << cpuGpu_always_.cpuGflops; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; if (cpuGpu_always_.M == 0) { // No offload threshold found rows.push_back({"GPU (Offload Always)", std::to_string(0), @@ -576,12 +562,9 @@ class doGemm { std::stringstream probSize_u; std::stringstream gpuGflops_u; std::stringstream cpuGflops_u; - probSize_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.probSize_kib; - gpuGflops_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.gpuGflops; - cpuGflops_u << std::fixed << std::setprecision(2) - << cpuGpu_unified_.cpuGflops; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; if (cpuGpu_unified_.M == 0) { // No offload threshold found rows.push_back({"GPU (Unified Memory)", std::to_string(0), @@ -606,12 +589,15 @@ class doGemm { /** The number of iterations to perform per problem size. */ const int iterations_; - /** The value of the first probelm size dimention run. */ + /** The value of the first problem size dimension run. */ const int startDimention_; - /** The maximum value of the largest problem size dimention. */ + /** The maximum value of the largest problem size dimension. 
*/ const int upperLimit_; + /** The step size between each problem size dimension. */ + const int step_; + /** Whether the CPU kernels should be run. */ const bool doCPU_ = true; @@ -620,12 +606,12 @@ class doGemm { #if CPU_ENABLED /** The GEMM CPU kernel. */ - cpu::gemm_cpu gemmCpu_; + cpu::gemm_cpu cpu_; #endif #if GPU_ENABLED /** The GEMM GPU kernel. */ - gpu::gemm_gpu gemmGpu_; + gpu::gemm_gpu gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/doGemv.hh b/include/doGemv.hh index b86aad6..c5edcd1 100644 --- a/include/doGemv.hh +++ b/include/doGemv.hh @@ -32,21 +32,22 @@ template class doGemv { public: doGemv(const std::string csvDir, const int iters, const int startDim, - const int upperLimit, const bool cpuEnabled = true, + const int upperLimit, const int step, const bool cpuEnabled = true, const bool gpuEnabled = true) : CSV_DIR(csvDir), iterations_(iters), startDimention_(startDim), upperLimit_(upperLimit), + step_(step), doCPU_(cpuEnabled), doGPU_(gpuEnabled) #if CPU_ENABLED , - gemvCpu_(iterations_) + cpu_(iterations_) #endif #if GPU_ENABLED , - gemvGpu_(iterations_) + gpu_(iterations_) #endif { static_assert((std::is_same_v || std::is_same_v) && @@ -66,7 +67,7 @@ class doGemv { prev_gpuResult_unified = time_checksum_gflop(); std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = dim; callKernels(csvFile, dim, dim); } @@ -94,8 +95,8 @@ class doGemv { int M = 16 * N; while (M <= upperLimit_) { callKernels(csvFile, M, N); - M += 16; - N++; + M += 16 * step_; + N += step_; } // Close file csvFile.close(); @@ -117,7 +118,7 @@ class doGemv { csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_tall-thin_vector_M_N=32.csv"); if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = dim, N = 32; callKernels(csvFile, dim, 32); } @@ -145,8 +146,8 @@ class doGemv { N = 16 * M; while (N <= upperLimit_) { callKernels(csvFile, M, N); - M++; - N += 16; + M += step_; + N += 16 * step_; } // Close file csvFile.close(); @@ -168,7 +169,7 @@ class doGemv { csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + "_short-wide_vector_M=32_N.csv"); if (upperLimit_ >= 32) { - for (int dim = startDimention_; dim <= upperLimit_; dim++) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { // M = 32, N = dim; callKernels(csvFile, 32, dim); } @@ -190,55 +191,54 @@ class doGemv { const uint64_t flops = calcFlops(M, N); std::string kernelName = getKernelName(); - time_checksum_gflop cpuResult; - time_checksum_gflop gpuResult_once; - time_checksum_gflop gpuResult_always; - time_checksum_gflop gpuResult_unified; - // Perform CPU kernel #if CPU_ENABLED + time_checksum_gflop cpuResult; if (doCPU_) { - gemvCpu_.initialise(M, N); - cpuResult = gemvCpu_.compute(); + cpu_.initialise(M, N); + cpuResult = cpu_.compute(); cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); // Write result to CSV file - writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, iterations_, - cpuResult.runtime, cpuResult.gflops); + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, 0.0, + iterations_, cpuResult.runtime, cpuResult.gflops); } #endif // Perform the GPU kernels #if GPU_ENABLED + time_checksum_gflop gpuResult_once; + 
time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; if (doGPU_) { // - ONCE : Offload to/from GPU once before all iterations and once // after - gemvGpu_.initialise(gpuOffloadType::once, M, N); - gpuResult_once = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::once, M, N); + gpuResult_once = gpu_.compute(); gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); // - ALWAYS: Offload to/from GPU every iteration - gemvGpu_.initialise(gpuOffloadType::always, M, N); - gpuResult_always = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::always, M, N); + gpuResult_always = gpu_.compute(); gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); // - UNIFIED : data passed from host to device (and device to host) as // needed - gemvGpu_.initialise(gpuOffloadType::unified, M, N); - gpuResult_unified = gemvGpu_.compute(); + gpu_.initialise(gpuOffloadType::unified, M, N); + gpuResult_unified = gpu_.compute(); gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); // Write results to CSV file writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, - iterations_, gpuResult_once.runtime, + 0.0, iterations_, gpuResult_once.runtime, gpuResult_once.gflops); writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, - probSize, iterations_, gpuResult_always.runtime, + probSize, 0.0, iterations_, gpuResult_always.runtime, gpuResult_always.gflops); writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, - iterations_, gpuResult_unified.runtime, + 0.0, iterations_, gpuResult_unified.runtime, gpuResult_unified.gflops); } #endif @@ -488,6 +488,9 @@ class doGemv { /** The maximum value of the largest problem size dimention. */ const int upperLimit_; + /** The step size between each problem size dimension. */ + const int step_; + /** Whether the CPU kernels should be run. */ const bool doCPU_ = true; @@ -496,12 +499,12 @@ class doGemv { #if CPU_ENABLED /** The GEMV CPU kernel. */ - cpu::gemv_cpu gemvCpu_; + cpu::gemv_cpu cpu_; #endif #if GPU_ENABLED /** The GEMV GPU kernel. */ - gpu::gemv_gpu gemvGpu_; + gpu::gemv_gpu gpu_; #endif /** The point at which offloading to GPU (offload once) becomes worthwhile. */ diff --git a/include/doSpmdnm.hh b/include/doSpmdnm.hh new file mode 100644 index 0000000..89ab8b1 --- /dev/null +++ b/include/doSpmdnm.hh @@ -0,0 +1,645 @@ +#pragma once +#include +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmdnm.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmdnm.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmdnm.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmdnm.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmdnm.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmdnm.hh" +#endif + + +/** +* 'T represents the type of the sparse GEMM kernel that will be run. 
E.g., + * T=float is for SSpMDnM +*/ +template +class doSpmdnm { +public: + doSpmdnm(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled = true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpmdnm can only be constructed using one of the " + "following types: [float, double]."); + } + + void collectData() { + // Square Problem Sizes... + // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = dim; + callKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } +#endif + + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = 
cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } +#endif + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = 
time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } +#endif + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } +#endif + // Close file + csvFile.close(); + } + +private: + /** Call the appropriate CPU and GPU GEMM kernels. */ + void callKernels(std::ofstream& csvFile, const int M, const int N, + const int K) { + const double probSize = calcKib(M, N, K); + const uint64_t flops = calcFlops(M, N, K); + std::string kernelName = getKernelName(); + +// Perform CPU kernel +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(M, N, K, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, K, probSize, + sparsity_, iterations_, cpuResult.runtime, cpuResult + .gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmm GPU classes to + * be updated. 
+ */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N, K, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, K, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N, K, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, K, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N, K, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, K, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, K, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + + /** Ensure all CPU and GPU checksums are within the permitted limit of + * eachother. 
*/ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, + const int M, const int N, const int K) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * + hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + cpuGpu_once_.K = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + cpuGpu_always_.K = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + cpuGpu_unified_.K = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + /** A function for calculating FLOPs performed by a SpMDnM. + * C = alpha*AB + beta*C */ + constexpr uint64_t calcFlops(const int M, const int N, const int K) const { + // Sparse Matrix x Dense Matrix is just a series of SpMDnV - one for each column + uint64_t NNZ = (uint64_t)((double)M * (double)K * (1.0 - sparsity_)); + return 2 * NNZ * N; + } + + /** A function for calculating the total GEMM problem size in KiB. + Uses a single CSR format matrix: (M+1) + 2NNZ; and two dense matrices */ + constexpr double calcKib(const int M, const int N, const int K) const { + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)K * (1.0 - sparsity_)); + uint64_t M_ = (uint64_t)M, N_ = (uint64_t)N, K_ = (uint64_t)K; + uint64_t probSize = (M_ + 1) + (2 * NNZ) + (K_ * N_) + (M_ * N_); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmdnm"; + case 8: + return "dspmdnm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector header = { + "Device", "M", "N", "K", "Total Prob. 
Size (KiB)", + "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_o.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), + std::to_string(cpuGpu_once_.K), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_a.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), + std::to_string(cpuGpu_always_.K), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_u.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), + std::to_string(cpuGpu_unified_.K), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first problem size dimension run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimension. */ + const int upperLimit_; + + /** The step size between each problem size dimension. */ + const int step_; + + /** The sparsity value of the sparse matrix. */ + const double sparsity_; + + const matrixType type_ = matrixType::rmat; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + + +#if CPU_ENABLED + /** The SpMDnM CPU kernel. */ + cpu::spmdnm_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The SpMDnM GPU kernel. 
*/ + gpu::spmdnm_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/doSpmdnv.hh b/include/doSpmdnv.hh new file mode 100644 index 0000000..2871c88 --- /dev/null +++ b/include/doSpmdnv.hh @@ -0,0 +1,522 @@ +#pragma once +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmdnv.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmdnv.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmdnv.hh" +#elif defined CPU_NVPL +// Todo #include "../NVPL/spmdnv.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmdnv.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmdnv.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmdnv.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SSpMDnV. */ +template +class doSpmdnv { +public: + doSpmdnv(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled =true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpMDnV can only be constructed using one of the " + "following types: [float, double]."); + } + + /** Run all problem types and write data to CSV files. */ + void collectData() { + // Square Problem Sizes... 
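+  /* Overview of the sweeps below: each block re-initialises the offload
+   * threshold structs, runs one family of problem shapes (square M=N,
+   * tall-and-thin M=16N, M with N=32, short-and-wide N=16M, M=32 with N),
+   * writes one CSV per family and, when both CPU and GPU are enabled,
+   * prints an offload-threshold table. A caller (argument values are
+   * illustrative only) would look like:
+   *   doSpmdnv<double> spmdnv("csv_out", 10, 64, 8192, 64, 0.99,
+   *                           matrixType::rmat);
+   *   spmdnv.collectData();
+   */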
+ // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = + initCSVFile(CSV_DIR + "/" + getKernelName() + "_square_vector_M=N.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim; + callKernels(csvFile, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Vector (M=N)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M=16N.csv"); + int N = startDimention_; + int M = 16 * N; + while (M <= upperLimit_) { + callKernels(csvFile, M, N); + M += 16 * step_; + N += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M=16N)"); + } +#endif + + // Tall and thin x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_vector_M_N=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32; + callKernels(csvFile, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Vector (M, N=32)"); + } +#endif + + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_N=16M.csv"); + M = startDimention_; + N = 16 * M; + while (N <= upperLimit_) { + callKernels(csvFile, M, N); + M += step_; + N += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (N=16M)"); + } +#endif + + // Short and wide x Vector + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + 
prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_vector_M=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim; + callKernels(csvFile, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Vector (M=32, N)"); + } +#endif + } + +private: + /** Call the appropriate CPU and GPU SPGEMV kernels. */ + void callKernels(std::ofstream& csvFile, const int M, const int N) { + const double probSize = calcKib(M, N, sparsity_); + const uint64_t flops = calcFlops(M, N, sparsity_); + std::string kernelName = getKernelName(); + +// Perform CPU kernel +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(M, N, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + // Write result to CSV file + writeLineToCsv(csvFile, "cpu", kernelName, M, N, 0, probSize, sparsity_, + iterations_, cpuResult.runtime, cpuResult.gflops); + } +#endif + +// Perform the GPU kernels +#if GPU_ENABLED + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmm GPU classes to + * be updated. + */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, M, N, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, M, N, 0, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, M, N, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, M, N, 0, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, M, N, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, M, N, 0, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif + +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Make sure all checksums match if CPU and GPU kernels are run. + // - The majority of BLAS Libraries guarentee the same result if a + // function + // is called multiple times. 
Given all input matrices are identical for + // each GPU offload type, we need only to compare the CPU and GPU + // checksums. + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N); + + // Check if offload structs should be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + + // Check if offload threshold has been achieved for each GPU offload type. + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, M, N, probSize); + + // Update previous results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + /** Todo -- find a sensible way to do this for sparse */ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. + */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
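To make the 0.1% tolerance used in checkChecksums above concrete, here is the same
relative-difference test in isolation (a minimal sketch; relDiffPercent is an illustrative helper
name, not part of the benchmark):

    double relDiffPercent(double cpuChecksum, double gpuChecksum) {
      // |cpu - gpu| expressed as a percentage of |cpu|, exactly as computed via
      // hundredOverChecksum in checkChecksums.
      return std::fabs(cpuChecksum - gpuChecksum) * (100.0 / std::fabs(cpuChecksum));
    }
    // relDiffPercent(1000.0, 1000.5) == 0.05 -> within the 0.1% limit
    // relDiffPercent(1000.0, 1002.0) == 0.20 -> if all three GPU checksums drift this far from
    //                                           the CPU checksum, the benchmark reports an error
    //                                           and exits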
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + } + } + + /** Todo -- work out how tis can be determined for a sparse problem with + * an unknown algorithm + * A function for calculating FLOPs performed by a GEMV. + * y = alpha*Ax + beta*y */ + constexpr uint64_t calcFlops(const int M, const int N, const double SPARSITY) const { + // There are two flops per non-zero element in the sparse matrix + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)N * (1.0 - SPARSITY)); + return 2 * NNZ; + } + + /** A function for calculating the total GEMV problem size in KiB. */ + constexpr double calcKib(const int M, const int N, const double SPARSITY) const { + // Needs a CSR format matrix (one array of ints size m + 1 (row pointers), one array of ints size nnz (column indices), and one array of fps of size nnz (values)) + // Also needs two vectors x and y, of sizes n and m, respectively + uint64_t NNZ = 1 + (uint64_t)((double)M * (double)N * (1.0 - SPARSITY)); + uint64_t intSize = (M + 1) + NNZ; + uint64_t fpSize = NNZ + N + M; + return (((double)(fpSize * (sizeof(T))) + (double)(intSize * sizeof(int64_t)))/ 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmdnv"; + case 8: + return "dspmdnv"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(std::string problemName) const { + std::vector header = { + "Device", "M", "N", "Total Prob. 
Size (KiB)", "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), probSize_o.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), probSize_a.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), probSize_u.str(), "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first probelm size dimention run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimention. */ + const int upperLimit_; + + /** The step size between each problem size dimension. */ + const int step_; + + /** The sparsity value of the sparse matrix. */ + const double sparsity_; + + const matrixType type_; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + +#if CPU_ENABLED + /** The SpMDnV CPU kernel. */ + cpu::spmdnv_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The SpMDnV GPU kernel. */ + gpu::spmdnv_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. 
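A worked example of the calcFlops and calcKib estimates defined above, for double precision with
M = N = 10000 and sparsity = 0.999 (illustrative numbers only):

    NNZ     = 1 + (uint64_t)(10000.0 * 10000.0 * 0.001)  = 100001 non-zero elements
    flops   = 2 * NNZ                                     = 200002
    intSize = (M + 1) + NNZ = 10001 + 100001              = 110002 int64_t entries
    fpSize  = NNZ + N + M   = 100001 + 10000 + 10000      = 120001 doubles
    KiB     = (120001 * 8 + 110002 * 8) / 1024            = approx. 1797 KiB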
+ */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/doSpmspm.hh b/include/doSpmspm.hh new file mode 100644 index 0000000..731c449 --- /dev/null +++ b/include/doSpmspm.hh @@ -0,0 +1,650 @@ +#pragma once +#include +#include +#include + +#include "helpers.hh" +#include "tablePrinter.hh" +#include "utilities.hh" + +#if defined CPU_ARMPL +#include "../ArmPL/spmspm.hh" +#elif defined CPU_ONEMKL +#include "../oneMKL/CPU/spmspm.hh" +#elif defined CPU_AOCL +#include "../AOCL/spmspm.hh" +#endif + +#if defined GPU_CUBLAS +#include "../cuBLAS/spmspm.hh" +#elif defined GPU_ONEMKL +#include "../oneMKL/GPU/spmspm.hh" +#elif defined GPU_ROCBLAS +#include "../rocBLAS/spmspm.hh" +#endif + +/** `T` represents the type of kernel that will be run - i.e. T=float is for + * SGEMM. */ +template +class doSpmspm { +public: + doSpmspm(const std::string csvDir, const int iters, const int startDim, + const int upperLimit, const int step, const double sparsity, const matrixType type, + const bool cpuEnabled = true, const bool gpuEnabled = true) + : CSV_DIR(csvDir), + iterations_(iters), + startDimention_(startDim), + upperLimit_(upperLimit), + step_(step), + sparsity_(sparsity), + type_(type), + doCPU_(cpuEnabled), + doGPU_(gpuEnabled) +#if CPU_ENABLED + , + cpu_(iterations_) +#endif +#if GPU_ENABLED + , + gpu_(iterations_) +#endif + { + static_assert((std::is_same_v || std::is_same_v) && + "ERROR - doSpmspm can only be constructed using one of the " + "following types: [float, double]."); + } + + /** Run all problem types and write data to CSV files. */ + void collectData() { + // Square Problem Sizes... 
+ // Re-initialise offload threshold structures + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + std::ofstream csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_square_M=N=K.csv"); + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = dim; + callKernels(csvFile, dim, dim, dim); + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Square (M=N=K)"); + } +#endif + + // Rectangular Problem Sizes: + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_M=16K.csv"); + int K = startDimention_; + int M = 16 * K; + int N = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, M=16K)"); + } +#endif + + // Tall and thin x Short and wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_short-wide_M=N_K=32.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = dim, K = 32; + callKernels(csvFile, dim, dim, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Short-and-Wide (M=N, K=32)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N_K=16M.csv"); + M = startDimention_; + N = startDimention_; + K = 16 * M; + while (K <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += step_; + K += 16 * step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N, K=16M)"); + } +#endif + + // Short and wide x Tall and thin + // Re-initialise offload threshold structures & previous results + 
cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_short-wide_tall-thin_M=N=32_K.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = 32, K = dim; + callKernels(csvFile, 32, 32, dim); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Short-and-Wide x Tall-and-Thin (M=N=32, K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N_M=16K.csv"); + K = startDimention_; + N = startDimention_; + M = 16 * K; + while (M <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += 16 * step_; + N += step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (K=N, M=16K)"); + } +#endif + + // Tall and Thin x Square + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_tall-thin_square_K=N=32_M.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = dim, N = 32, K = 32; + callKernels(csvFile, dim, 32, 32); + } + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Tall-and-Thin x Square (M, K=N=32)"); + } +#endif + + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K_N=16K.csv"); + M = startDimention_; + K = startDimention_; + N = 16 * K; + while (N <= upperLimit_) { + callKernels(csvFile, M, N, K); + M += step_; + N += 16 * step_; + K += step_; + } + // Close file + csvFile.close(); +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)"); + } +#endif + // Square x Short and Wide + // Re-initialise offload threshold structures & previous results + cpuGpu_always_ = cpuGpu_offloadThreshold(); + cpuGpu_once_ = cpuGpu_offloadThreshold(); + cpuGpu_unified_ = 
cpuGpu_offloadThreshold(); + prev_gpuResult_always = time_checksum_gflop(); + prev_gpuResult_once = time_checksum_gflop(); + prev_gpuResult_unified = time_checksum_gflop(); + csvFile = initCSVFile(CSV_DIR + "/" + getKernelName() + + "_square_short-wide_M=K=32_N.csv"); + if (upperLimit_ >= 32) { + for (int dim = startDimention_; dim <= upperLimit_; dim += step_) { + // M = 32, N = dim, K = 32; + callKernels(csvFile, 32, dim, 32); + } + } +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Print offload results to stdout + printOffloadThreshold("Square x Short-and-Wide (M=K=32, N)"); + } +#endif + // Close file + csvFile.close(); + } + +private: + /** Ensure all CPU and GPU checksums are within the permitted limit of + * eachother. */ + void checkChecksums(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K) { + // Ensure that each checksum difference is less than 0.1% + double hundredOverChecksum = 100 / std::fabs(cpuResult.checksum); + if (((std::fabs(cpuResult.checksum - gpuResult_once.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_always.checksum) * + hundredOverChecksum)) > 0.1 && + ((std::fabs(cpuResult.checksum - gpuResult_unified.checksum) * + hundredOverChecksum)) > 0.1) { + std::cerr << "ERROR - " << getKernelName() << " kernel checksums do not match:\n\tInput " + "dimensions: M=" << M << ", N=" << N << ", K=" << K << std::endl; + std::cerr << std::setprecision(10) << "\tCPU Checksum = " << cpuResult.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Once) Checksum = " << gpuResult_once.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Always) Checksum = " << gpuResult_always.checksum << std::endl; + std::cerr << std::setprecision(10) << "\tGPU (Unified) Checksum = " << gpuResult_unified.checksum << std::endl; + exit(1); + } + } + + /** Check whether the offload structures need to be reset; and doing so if + * required. + * - If CPU.gflops >= GPU.gflops for last two problem sizes, then reset + * offload structures as GPU may not necessarily have reached the offload + * threshold. */ + void checkOffloadStructReset(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified) { + if ((cpuGpu_once_.M != 0) && (cpuResult.gflops >= gpuResult_once.gflops) && + (cpuResult.gflops >= prev_gpuResult_once.gflops)) { + cpuGpu_once_.cpuGflops = 0.0; + cpuGpu_once_.gpuGflops = 0.0; + cpuGpu_once_.probSize_kib = 0.0; + cpuGpu_once_.M = 0; + cpuGpu_once_.N = 0; + cpuGpu_once_.K = 0; + } + if ((cpuGpu_always_.M != 0) && + (cpuResult.gflops >= gpuResult_always.gflops) && + (cpuResult.gflops >= prev_gpuResult_always.gflops)) { + cpuGpu_always_.cpuGflops = 0.0; + cpuGpu_always_.gpuGflops = 0.0; + cpuGpu_always_.probSize_kib = 0.0; + cpuGpu_always_.M = 0; + cpuGpu_always_.N = 0; + cpuGpu_always_.K = 0; + } + if ((cpuGpu_unified_.M != 0) && + (cpuResult.gflops >= gpuResult_unified.gflops) && + (cpuResult.gflops >= prev_gpuResult_unified.gflops)) { + cpuGpu_unified_.cpuGflops = 0.0; + cpuGpu_unified_.gpuGflops = 0.0; + cpuGpu_unified_.probSize_kib = 0.0; + cpuGpu_unified_.M = 0; + cpuGpu_unified_.N = 0; + cpuGpu_unified_.K = 0; + } + } + + /** Update the offload threshold structs if GPU.gflops > CPU.gflops. 
*/ + void updateOffloadStructs(time_checksum_gflop cpuResult, + time_checksum_gflop gpuResult_once, + time_checksum_gflop gpuResult_always, + time_checksum_gflop gpuResult_unified, const int M, + const int N, const int K, const double probSize) { + if ((cpuGpu_once_.M == 0) && cpuResult.gflops < gpuResult_once.gflops) { + cpuGpu_once_.cpuGflops = cpuResult.gflops; + cpuGpu_once_.gpuGflops = gpuResult_once.gflops; + cpuGpu_once_.probSize_kib = probSize; + cpuGpu_once_.M = M; + cpuGpu_once_.N = N; + cpuGpu_once_.K = K; + } + if ((cpuGpu_always_.M == 0) && cpuResult.gflops < gpuResult_always.gflops) { + cpuGpu_always_.cpuGflops = cpuResult.gflops; + cpuGpu_always_.gpuGflops = gpuResult_always.gflops; + cpuGpu_always_.probSize_kib = probSize; + cpuGpu_always_.M = M; + cpuGpu_always_.N = N; + cpuGpu_always_.K = K; + } + if ((cpuGpu_unified_.M == 0) && + cpuResult.gflops < gpuResult_unified.gflops) { + cpuGpu_unified_.cpuGflops = cpuResult.gflops; + cpuGpu_unified_.gpuGflops = gpuResult_unified.gflops; + cpuGpu_unified_.probSize_kib = probSize; + cpuGpu_unified_.M = M; + cpuGpu_unified_.N = N; + cpuGpu_unified_.K = K; + } + } + + void callKernels(std::ofstream& csvFile, const int N, const int M, + const int K) { + const double probSize = calcKib(N, N, N, sparsity_); + const uint64_t flops = calcFlops(N, N, N, sparsity_); + std::string kernelName = getKernelName(); + +#if CPU_ENABLED + time_checksum_gflop cpuResult; + if (doCPU_) { + cpu_.initialise(N, M, K, sparsity_, type_); + cpuResult = cpu_.compute(); + cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime); + writeLineToCsv(csvFile, "cpu", kernelName, N, M, K, probSize, + sparsity_, iterations_, cpuResult.runtime, + cpuResult.gflops); + } +#endif +#if GPU_ENABLED + // Perform the GPU kernels + time_checksum_gflop gpuResult_always; + time_checksum_gflop gpuResult_once; + time_checksum_gflop gpuResult_unified; + /* + * We run three different offload types: + * - ALWAYS: Offload to/from GPU every iteration + * - ONCE : Offload to/from GPU once before all iterations and once after + * - UNIFIED : data passed from host to device (and device to host) as needed + * THE ORDER OF THESE IS IMPORTANT -- To reduce time spent generating matrices, we + * generate once during the ALWAYS offload, and then re-use the same matrices for + * the ONCE and UNIFIED offload tests. Deleting them after UNIFIED. Therefore, + * changing the order here will require this logic within the spmspm GPU classes to + * be updated. 
+ */ + if (doGPU_) { + // - ALWAYS: Offload to/from GPU every iteration + gpu_.initialise(gpuOffloadType::always, N, M, K, sparsity_, type_); + gpuResult_always = gpu_.compute(); + gpuResult_always.gflops = + calcGflops(flops, iterations_, gpuResult_always.runtime); + writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, M, K, + probSize, sparsity_, iterations_, gpuResult_always.runtime, + gpuResult_always.gflops); + + // - ONCE : Offload to/from GPU once before all iterations and once + // after + gpu_.initialise(gpuOffloadType::once, N, M, K, sparsity_, type_); + gpuResult_once = gpu_.compute(); + gpuResult_once.gflops = + calcGflops(flops, iterations_, gpuResult_once.runtime); + writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, M, K, probSize, + sparsity_, iterations_, gpuResult_once.runtime, + gpuResult_once.gflops); + + // - UNIFIED : data passed from host to device (and device to host) as + // needed + gpu_.initialise(gpuOffloadType::unified, N, M, K, sparsity_, type_); + gpuResult_unified = gpu_.compute(); + gpuResult_unified.gflops = + calcGflops(flops, iterations_, gpuResult_unified.runtime); + writeLineToCsv(csvFile, "gpu_unified", kernelName, N, M, K, probSize, + sparsity_, iterations_, gpuResult_unified.runtime, + gpuResult_unified.gflops); + } +#endif +#if CPU_ENABLED && GPU_ENABLED + if (doCPU_ && doGPU_) { + // Check that all checksums are within the permitted limit + checkChecksums(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, N, M, K); + // Check whether offload structs need to be reset + checkOffloadStructReset(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified); + // Update offload structs if required + updateOffloadStructs(cpuResult, gpuResult_once, gpuResult_always, + gpuResult_unified, N, M, K, probSize); + // Update previous GPU results + prev_gpuResult_once = gpuResult_once; + prev_gpuResult_always = gpuResult_always; + prev_gpuResult_unified = gpuResult_unified; + } +#endif + } + + /** A function for calculating FLOPs performed by a GEMM. + * C = alpha*AB + beta*C */ + constexpr uint64_t calcFlops(const int M, const int N, const int K, const double SPARSITY) const { + // The number of scalar multiplications is nnz(Ak)*nnz(Bk) for each inner index k + // Therefore, the expectation is to have NNZA * NNZB / K, as each K index would + // on average have NNZA/K * NNZB/K. This assumes a uniform distribution of non-zero elements + uint64_t NNZA = 1 + (uint64_t)((double)M * (double)K * (1.0 - SPARSITY)); + uint64_t NNZB = 1 + (uint64_t)((double)K * (double)N * (1.0 - SPARSITY)); + return (NNZA * NNZB) / K; + } + + /** A function for calculating the total GEMM problem size in KiB. + Each matrix is stored in CSR, and so needs (nRows + 1) + 2NNZ space. + For A and B, this is easy, but for C we do not know its size ahead of time + (we know nRows but not NNZ). However, we can estimate the NNZ, on average. + + Each value of C is the sum of the products of the corresponding row of A + and column of B. + As each value of A and B has a probability of (1 - SPARSITY) if being non-zero, + the probability that both A and B are non-zero (and thus that the product is + non-zero)is (1 - SPARSITY)^2. + There are K products that are summed together. If any one of these products is + non-zero, so too shall the sum be. 
Therefore, the estimated sparsity of C is + (1 - (1 - SPARSITY)^2)^K + */ + constexpr double calcKib(const int M, const int N, const int K, const double SPARSITY) const { + uint64_t M_ = (uint64_t)M, K_ = (uint64_t)K; + uint64_t NNZA = 1 + (uint64_t)((double)M * (double)K * (1.0 - SPARSITY)); + uint64_t NNZB = 1 + (uint64_t)((double)K * (double)N * (1.0 - SPARSITY)); + double CSPARSITY = 1 - pow(pow(1.0 - SPARSITY, 2), K); + uint64_t NNZC = 1 + (uint64_t)((double)M * (double)N * CSPARSITY); + + uint64_t probSize = (M_ + 1) + (2 * NNZA) + (K_ + 1) + (2 * NNZB) + (M_ + 1) + (2 * NNZC); + return ((double)(probSize * (sizeof(T))) / 1024); + } + + /** Get the name of the kernel being run. */ + std::string getKernelName() const { + switch (sizeof(T)) { + case 4: + return "sspmspm"; + case 8: + return "dspmspm"; + default: + return "unknown"; + } + } + + /** Print to stdout the offload thresholds. */ + void printOffloadThreshold(const std::string& problemName) const { + std::vector header = { + "Device", "M", "N", "K", "Total Prob. Size (KiB)", + "GFLOP/s", "CPU GFLOP/s"}; + + std::vector> rows; + // Initialise GPU_Once row + std::stringstream probSize_o; + std::stringstream gpuGflops_o; + std::stringstream cpuGflops_o; + probSize_o << std::fixed << std::setprecision(2) << cpuGpu_once_.probSize_kib; + gpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.gpuGflops; + cpuGflops_o << std::fixed << std::setprecision(2) << cpuGpu_once_.cpuGflops; + if (cpuGpu_once_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Once)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_o.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Once)", std::to_string(cpuGpu_once_.M), + std::to_string(cpuGpu_once_.N), + std::to_string(cpuGpu_once_.K), probSize_o.str(), + gpuGflops_o.str(), cpuGflops_o.str()}); + } + + // Initialise GPU_always row + std::stringstream probSize_a; + std::stringstream gpuGflops_a; + std::stringstream cpuGflops_a; + probSize_a << std::fixed << std::setprecision(2) << cpuGpu_always_.probSize_kib; + gpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.gpuGflops; + cpuGflops_a << std::fixed << std::setprecision(2) << cpuGpu_always_.cpuGflops; + if (cpuGpu_always_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Offload Always)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_a.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Offload Always)", std::to_string(cpuGpu_always_.M), + std::to_string(cpuGpu_always_.N), + std::to_string(cpuGpu_always_.K), probSize_a.str(), + gpuGflops_a.str(), cpuGflops_a.str()}); + } + + // Initialise GPU_unified row + std::stringstream probSize_u; + std::stringstream gpuGflops_u; + std::stringstream cpuGflops_u; + probSize_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.probSize_kib; + gpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.gpuGflops; + cpuGflops_u << std::fixed << std::setprecision(2) << cpuGpu_unified_.cpuGflops; + if (cpuGpu_unified_.M == 0) { + // No offload threshold found + rows.push_back({"GPU (Unified Memory)", std::to_string(0), + std::to_string(0), std::to_string(0), probSize_u.str(), + "N/A", "N/A"}); + } else { + rows.push_back({"GPU (Unified Memory)", std::to_string(cpuGpu_unified_.M), + std::to_string(cpuGpu_unified_.N), + std::to_string(cpuGpu_unified_.K), probSize_u.str(), + gpuGflops_u.str(), cpuGflops_u.str()}); + } + + // Print table + tablePrinter tPrinter( + problemName + " 
Problem Domian GPU Offload Thresholds:", header, rows); + tPrinter.print(1); + } + + /** The output directory where CSV files should be saved to. */ + const std::string CSV_DIR; + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** The value of the first probelm size dimention run. */ + const int startDimention_; + + /** The maximum value of the largest problem size dimention. */ + const int upperLimit_; + + /** The step size between problem sizes. */ + const int step_; + + /** The sparsity value of the sparse matrices. */ + const double sparsity_; + + const matrixType type_; + + /** Whether the CPU kernels should be run. */ + const bool doCPU_ = true; + + /** Whether the GPU kernels should be run. */ + const bool doGPU_ = true; + +#if CPU_ENABLED + /** The CPU kernel. */ + cpu::spmspm_cpu cpu_; +#endif + +#if GPU_ENABLED + /** The GPU kernel. */ + gpu::spmspm_gpu gpu_; +#endif + + /** The point at which offloading to GPU (offload once) becomes worthwhile. */ + cpuGpu_offloadThreshold cpuGpu_once_; + + /** The point at which offloading to GPU (offload always) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_always_; + + /** The point at which offloading to GPU (unified memory) becomes worthwhile. + */ + cpuGpu_offloadThreshold cpuGpu_unified_; + + /** The previous problem size's GPU (offload once) performance results. */ + time_checksum_gflop prev_gpuResult_once; + + /** The previous problem size's GPU (offload always) performance results. */ + time_checksum_gflop prev_gpuResult_always; + + /** The previous problem size's GPU (unified memory) performance results. */ + time_checksum_gflop prev_gpuResult_unified; +}; \ No newline at end of file diff --git a/include/helpers.hh b/include/helpers.hh index 5618557..db8df69 100644 --- a/include/helpers.hh +++ b/include/helpers.hh @@ -17,9 +17,8 @@ std::ofstream initCSVFile(const std::string filename) { std::ofstream newFile(filename); - newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),Iterations,Total " - "Seconds,GFLOP/s" - << std::endl; + newFile << "Device,Kernel,M,N,K,Total Problem Size (KiB),sparsity,Iterations," + "Total Seconds,GFLOP/s" << std::endl; return newFile; } @@ -28,22 +27,17 @@ std::ofstream initCSVFile(const std::string filename) { * Function does not close the file. */ void writeLineToCsv(std::ofstream& file, const std::string device, const std::string kernel, const int M, const int N, - const int K, const double totalProbSize, const int iters, - const double totalTime, const double gflops) { + const int K, const double totalProbSize, const float + sparsity, const int iters, const double totalTime, + const double gflops) { if (!file.is_open()) { - std::cout << "ERROR - Attempted to write line to a closed CSV file." - << std::endl; + std::cout << "ERROR - Attempted to write line to a closed CSV file." << std::endl; exit(1); } - file << device << "," << kernel << "," << M << "," << N << "," << K << "," - << std::fixed << std::setprecision(3) << totalProbSize << "," << iters - << "," << std::fixed << std::setprecision(5) << totalTime << "," - << std::fixed << std::setprecision(3) << gflops << std::endl; + file << device << "," << kernel << "," << M << "," << N << "," << K << "," << std::fixed << std::setprecision(3) << totalProbSize << "," << std::fixed << std::setprecision(8) << sparsity << "," << iters << "," << std::fixed << std::setprecision(5) << totalTime << "," << std::fixed << std::setprecision(3) << gflops << std::endl; } /** Calculate average GFLOPs. 
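For example (illustrative numbers), calcGflops(2000000000, 10, 0.5) = (2e9 * 10 / 0.5) * 1e-9
= 40.0 GFLOP/s; a total elapsed time of exactly zero returns 0.0 rather than dividing by zero.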
*/ double calcGflops(const uint64_t flops, const int iters, const double seconds) { - return (seconds == 0.0 || seconds == INFINITY) - ? 0.0 - : ((double)(flops * iters) / seconds) * 1e-9; + return (seconds == 0.0) ? 0.0 : ((double)(flops * iters) / seconds) * 1e-9; } \ No newline at end of file diff --git a/include/kernels/.DS_Store b/include/kernels/.DS_Store new file mode 100644 index 0000000..9cc84b2 Binary files /dev/null and b/include/kernels/.DS_Store differ diff --git a/include/kernels/CPU/gemm.hh b/include/kernels/CPU/gemm.hh index 6b4c93e..6dd4786 100644 --- a/include/kernels/CPU/gemm.hh +++ b/include/kernels/CPU/gemm.hh @@ -7,7 +7,7 @@ namespace cpu { /** An abstract class for GEMM BLAS kernels. */ template class gemm : public ::gemm { - public: +public: using ::gemm::gemm; using ::gemm::initInputMatrices; using ::gemm::m_; @@ -17,7 +17,7 @@ class gemm : public ::gemm { using ::gemm::B_; using ::gemm::C_; - public: +public: /** Initialise the required data structures. */ void initialise(int m, int n, int k) { m_ = m; @@ -32,7 +32,7 @@ class gemm : public ::gemm { initInputMatrices(); } - private: +private: /** Do any necessary cleanup (free pointers, close library handles, etc.) * after Kernel has been called. */ void postCallKernelCleanup() override { diff --git a/include/kernels/CPU/spmdnm.hh b/include/kernels/CPU/spmdnm.hh new file mode 100644 index 0000000..b383fe7 --- /dev/null +++ b/include/kernels/CPU/spmdnm.hh @@ -0,0 +1,61 @@ +#pragma once + +#include "../spmdnm.hh" + +namespace cpu { + +/** + * An abstract class for sparse matrix-dense matrix BLAS kernels + */ +template +class spmdnm : public :: spmdnm { +public: + using ::spmdnm::spmdnm; + using ::spmdnm::initInputMatrices; + using ::spmdnm::iterations_; + using ::spmdnm::nnz_; + using ::spmdnm::sparsity_; + using ::spmdnm::type_; + using ::spmdnm::m_; + using ::spmdnm::n_; + using ::spmdnm::k_; + using ::spmdnm::B_; + using ::spmdnm::C_; + +public: + /** + * Initialise the required data structures. + */ + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + // Allocate memory for dense matrices + B_ = (T*)calloc(k_ * n_, sizeof(T)); + C_ = (T*)calloc(m_ * n_, sizeof(T)); + + // Check for allocation failures + if (!B_ || !C_) { + std::cerr << "ERROR: Memory allocation failed in spmdnm initialization" << std::endl; + exit(1); + } + + initInputMatrices(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(B_); + free(C_); + } +}; + +} \ No newline at end of file diff --git a/include/kernels/CPU/spmdnv.hh b/include/kernels/CPU/spmdnv.hh new file mode 100644 index 0000000..ab02207 --- /dev/null +++ b/include/kernels/CPU/spmdnv.hh @@ -0,0 +1,52 @@ +#pragma once + +#include "../spmdnv.hh" + +#include +#include + +namespace cpu { + +/** An abstract class for SpMDnV BLAS kernels. */ +template +class spmdnv : public ::spmdnv { +public: + using ::spmdnv::spmdnv; + using ::spmdnv::initInputMatrixVector; + using ::spmdnv::m_; + using ::spmdnv::n_; + using ::spmdnv::x_; + using ::spmdnv::y_; + using ::spmdnv::sparsity_; + using ::spmdnv::nnz_; + using ::spmdnv::type_; + +public: + /** Initialise the required data structures. 
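How the doSpmdnv driver exercises this interface (a sketch; spmdnv_cpu is the library-specific
subclass selected at build time, e.g. the ArmPL or AOCL one, and the values are illustrative):

    cpu::spmdnv_cpu<double> kernel(10);                        // 10 timed iterations
    kernel.initialise(8192, 8192, 0.999, matrixType::random);  // M, N, sparsity, matrix type
    time_checksum_gflop result = kernel.compute();             // runtime and checksum are filled in;
                                                               // the driver computes gflops afterwards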
*/ + void initialise(int m, int n, double sparsity, matrixType type) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + // Note that the below should be the same as the edges calculation + // used in the initInputMatricesSparse function. If changed here, + // change there + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)malloc(sizeof(T) * n_); + y_ = (T*)malloc(sizeof(T) * m_); + + // Initialise the matrix and vectors + initInputMatrixVector(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() { + free(x_); + free(y_); + } +}; +} // namespace cpu \ No newline at end of file diff --git a/include/kernels/CPU/spmspm.hh b/include/kernels/CPU/spmspm.hh new file mode 100644 index 0000000..e0bce32 --- /dev/null +++ b/include/kernels/CPU/spmspm.hh @@ -0,0 +1,53 @@ +#pragma once + +#include "../spmspm.hh" + +#include +#include +#include + +namespace cpu { + +/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */ +template +class spmspm : public ::spmspm { +public: + using ::spmspm::spmspm; + using ::spmspm::initInputMatrices; + using ::spmspm::iterations_; + using ::spmspm::A_nnz_; + using ::spmspm::B_nnz_; + using ::spmspm::sparsity_; + using ::spmspm::type_; + using ::spmspm::m_; + using ::spmspm::n_; + using ::spmspm::k_; + using ::spmspm::C_rows_; + using ::spmspm::C_cols_; + using ::spmspm::C_vals_; + using ::spmspm::C_nnz_; + +public: + /** Initialise the required data structures. */ + void initialise(int n, int m, int k, double sparsity, + matrixType type, bool binary = false) { + n_ = n; + m_ = m; + k_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + initInputMatrices(); + } + +private: + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + void postCallKernelCleanup() {} +}; +} // namespace cpu diff --git a/include/kernels/GPU/spmdnm.hh b/include/kernels/GPU/spmdnm.hh new file mode 100644 index 0000000..b817280 --- /dev/null +++ b/include/kernels/GPU/spmdnm.hh @@ -0,0 +1,29 @@ +#pragma once + +#include "../spmdnm.hh" + +namespace gpu { + +/** An abstract class for sparse matrix-dense matrix BLAS kernels. */ + template + class spmdnm : public ::spmdnm { + public: + using ::spmdnm::spmdnm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. 
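A sketch of how the benchmark drives an implementation of this interface, once per offload
strategy (spmdnm_gpu is the vendor-specific subclass, e.g. the one under cuBLAS/; values are
illustrative):

    gpu::spmdnm_gpu<float> kernel(10);
    for (auto offload : {gpuOffloadType::always, gpuOffloadType::once, gpuOffloadType::unified}) {
      kernel.initialise(offload, 4096, 4096, 4096, 0.99, matrixType::rmat);
      time_checksum_gflop result = kernel.compute();
    }
    // The drivers run the ALWAYS case first so that the ONCE and UNIFIED cases can reuse the
    // matrices generated for it.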
*/ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/GPU/spmdnv.hh b/include/kernels/GPU/spmdnv.hh new file mode 100644 index 0000000..41e46ac --- /dev/null +++ b/include/kernels/GPU/spmdnv.hh @@ -0,0 +1,28 @@ +#pragma once + +#include "../spmdnv.hh" + +namespace gpu { + +/** An abstract class for SpMDnV BLAS kernels. */ + template + class spmdnv : public ::spmdnv { + public: + using ::spmdnv::spmdnv; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, + double sparsity, matrixType type) = 0; + + protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after. */ + gpuOffloadType offload_ = gpuOffloadType::always; + }; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/GPU/spmspm.hh b/include/kernels/GPU/spmspm.hh new file mode 100644 index 0000000..e36d470 --- /dev/null +++ b/include/kernels/GPU/spmspm.hh @@ -0,0 +1,29 @@ +#pragma once + +#include "../spmspm.hh" + +namespace gpu { + +/** An abstract class for sparse matrix-sparse matrix BLAS kernels. */ +template +class spmspm : public ::spmspm { +public: + using ::spmspm::spmspm; + + /** Initialise the required data structures. + * `offload` refers to the data offload type: + * - Once: Move data from host to device before all iterations & move from + * device to host after all iterations + * - Always: Move data from host to device and device to host each iteration + * - Unified: Initialise data as unified memory; no data movement semantics + * required */ + virtual void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) = 0; + +protected: + /** Whether data should be offloaded to/from the GPU each iteration, or just + * before & after.
*/ + gpuOffloadType offload_ = gpuOffloadType::always; +}; +} // namespace gpu \ No newline at end of file diff --git a/include/kernels/gemm.hh b/include/kernels/gemm.hh index 4eda90f..3f0aece 100644 --- a/include/kernels/gemm.hh +++ b/include/kernels/gemm.hh @@ -1,9 +1,15 @@ #pragma once +#ifdef CPU_ONEMKL +#include +#endif + #include #include #include #include +#include +#include #include "../utilities.hh" diff --git a/include/kernels/gemv.hh b/include/kernels/gemv.hh index ba12d02..a64b19c 100644 --- a/include/kernels/gemv.hh +++ b/include/kernels/gemv.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include "../utilities.hh" @@ -82,6 +83,83 @@ class gemv { } } + void initInputMatrixVectorSparse() { + // Initialise sparse matrix + for (int i = 0; i < (n_ * n_); i++) { + A_[i] = 0.0; + } + + // Random number generator objects for use in descent + std::default_random_engine gen; + gen.seed(std::chrono::system_clock::now() + .time_since_epoch().count()); + std::uniform_real_distribution dist(0.0, 1.0); + + uint64_t edges = 1 + (uint64_t)((double)n_ * (double)n_ * (1.0 - + sparsity_)); + + // Using a=0.45 and b=c=0.22 as default probabilities + for (uint64_t i = 0; i < edges; i++) { + while (!rMat(A_, n_, 0, n_ - 1, 0, n_ - 1, 0.45, 0.22, 0.22, &gen, dist, + false)) {} + } + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + } + + /** Recursive function to populate sparse matrices */ + bool rMat(T* M, int n, int x1, int x2, int y1, int y2, float a, float b, + float c, std::default_random_engine* gen, + std::uniform_real_distribution dist, bool bin) { + // If a 1x1 submatrix, then add an edge and return out + if (x1 >= x2 && y1 >= y2) { + // Needed to avoid overfloe segfaults with large problem sizes + uint64_t index = (((uint64_t)y1 * (uint64_t)n) + (uint64_t)x1); + if (abs(M[index]) > 0.1) { + return false; + } else { + // Add 1.0 if this is a binary graph, and a random real number otherwise + M[index] = (bin) ? 1.0 : (((rand() % 10000) / 100.0) - 50.0); + return true; + } + } else { + // Divide up the matrix + int xMidPoint = x1 + floor((x2 - x1) / 2); + int yMidPoint = y1 + floor((y2 - y1) / 2); + + // ToDo -- add some noise to these values between iterations + float newA = a; + float newB = b; + float newC = c; + + // Work out which quarter to recurse into + // There are some ugly ternary operators here to avoid going out of bounds in the edge case + // that we are already at 1 width or 1 height + float randomNum = dist(*gen); + if (randomNum < a) { + return rMat(M, n, x1, xMidPoint, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b)) { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, y1, yMidPoint, + newA, newB, newC, gen, dist, bin); + } else if (randomNum < (a + b + c)) { + return rMat(M, n, x1, xMidPoint, ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, + newA, newB, newC, gen, dist, bin); + } else { + return rMat(M, n, ((xMidPoint < x2) ? xMidPoint + 1 : xMidPoint), x2, + ((yMidPoint < y2) ? yMidPoint + 1 : yMidPoint), y2, newA, newB, newC, + gen, dist, bin); + } + } + return true; + } + /** Call the extern consume() function. */ void callConsume() { consume((void*)A_, (void*)x_, (void*)y_); } @@ -105,4 +183,6 @@ class gemv { /** The distance between two vector elements. 
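To illustrate the quadrant selection in rMat above: with the default probabilities a = 0.45 and
b = c = 0.22 (leaving an implicit d = 1 - a - b - c = 0.11), a uniform draw r in [0, 1) recurses
into the top-left quadrant when r < 0.45, the top-right when 0.45 <= r < 0.67, the bottom-left
when 0.67 <= r < 0.89, and the bottom-right otherwise. The recursion repeats on the chosen
quadrant until a 1x1 cell is reached; if that cell is already occupied the call returns false and
initInputMatrixVectorSparse retries until the requested number of edges has been placed.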
*/ const int vecIncrement_ = 1; + + double sparsity_ = 0.0; }; diff --git a/include/kernels/spmdnm.hh b/include/kernels/spmdnm.hh new file mode 100644 index 0000000..92ea1ad --- /dev/null +++ b/include/kernels/spmdnm.hh @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** +* A generic abstract class defining the operation of timing a sparse GEMM + * BLAS kernel for n iterations +*/ +template +class spmdnm { +public: + spmdnm(const int iters) : iterations_(iters) {} + + /** Call the kernel n times. Returns the time elapsed for all n calls + * in seconds */ + time_checksum_gflop compute() { + // Start the timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // perform the SPMM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpmdnm(); + } + postLoopRequirements(); + + // Stop the timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + + int64_t nnz_ = 0; + +private: + /** Performs the steps required before calling the SPMM kernel that + * should be timed */ + virtual void preLoopRequirements() = 0; + + /** Perform the sparse GEMM kernel. */ + virtual void callSpmdnm() = 0; + + /** Perform any steps required after calling the SPMM kernel that should + * be timed */ + virtual void postLoopRequirements() = 0; + + /** Do the necessary cleanup after the kernel has been finished that + * should not be timed */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result matrix C. */ + constexpr double calcChecksum() { + return 0; + // Checksum for GEMM calculated by summing all four corners of C together + return ((double)C_[0] + (double)C_[m_ - 1] + (double)C_[(m_ * (n_ - 1))] + + (double)C_[m_ * n_ - 1]); + } + +protected: + /** Set up the starting matrices */ + void initInputMatrices() { + // Initialize B with random values + srand(SEED); + for (int i = 0; i < (k_ * n_); i++) { + B_[i] = (T)((double)(rand() % 100) / 7.0); + } + + // Initialize C to zero + for (int i = 0; i < (m_ * n_); i++) { + C_[i] = (T)0.0; + } + + toSparseFormat(); + } + + /** Move matrices into the sparse representation of for the given library */ + virtual void toSparseFormat() = 0; + + /** Call the external consume() function on the matrices */ + void callConsume() {}/** Recursive function to populate sparse matrices */ + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int64_t m_ = 0; + + /** Matrix dimension N. */ + int64_t n_ = 0; + + /** Matrix dimension K. */ + int64_t k_ = 0; + + /** Dense representation of input matrix B. */ + T* B_; + + /** Dense representation of output matrix C. */ + T* C_; + + double sparsity_; + + matrixType type_; +}; \ No newline at end of file diff --git a/include/kernels/spmdnv.hh b/include/kernels/spmdnv.hh new file mode 100644 index 0000000..b887e29 --- /dev/null +++ b/include/kernels/spmdnv.hh @@ -0,0 +1,118 @@ + +#pragma once + +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** A generic abstract class defining the operation of timing an SPGEMM BLAS + * kernel for n iterations. */ +template +class spmdnv { +public: + spmdnv(const int iters) : iterations_(iters) {} + + /** Call the BLAS kernel n times. 
+ * Returns the time elapsed for n BLAS calls in seconds. */ + time_checksum_gflop compute() { + // Start timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // Perform all SPGEMM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpMDnV(); + } + postLoopRequirements(); + + // Stop Timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + // Get time elapsed in seconds + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + + int64_t nnz_ = 0; + +private: + /** Perform any required steps before calling the SpMDnV kernel that should + * be timed. */ + virtual void preLoopRequirements() = 0; + + /** Perform the SpMDnV kernel. */ + virtual void callSpMDnV() = 0; + + /** Perform any required steps after calling the SpMDnV kernel that should + * be timed. */ + virtual void postLoopRequirements() = 0; + + /** Do any necessary cleanup (free pointers, close library handles, etc.) + * after Kernel has been called. */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result vector y. */ + // Todo -- work out how to sensibly do this for sparse + constexpr double calcChecksum() { + // Checksum for SpMDnV calculated by summing max and min element of output + // vector + return ((double)y_[0] + (double)y_[m_ - 1]); + } + +protected: + void initInputMatrixVector() { + // Set the seed to allow checksum to work + srand(SEED); + + // Initialise the input and output vectors + for (int y = 0; y < n_; y++) { + x_[y] = (T)((double)(rand() % 100) / 3.0); + } + for (int y = 0; y < m_; y++) { + y_[y] = (T)0.0; + } + + toSparseFormat(); + } + + bool print_ = false; + + /** Move starting matrix into the sparse representation of for the given + * library */ + virtual void toSparseFormat() = 0; + + /** Call the extern consume() function. */ + void callConsume() {} + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int m_ = 0; + + /** Matrix / vector dimension N. */ + int n_ = 0; + + /** Input vector x. */ + T* x_; + + /** Input vector y. */ + T* y_; + + /** The distance between two vector elements. */ + const int vecIncrement_ = 1; + + double sparsity_ = 0.0; + + matrixType type_; +}; diff --git a/include/kernels/spmspm.hh b/include/kernels/spmspm.hh new file mode 100644 index 0000000..87f55b3 --- /dev/null +++ b/include/kernels/spmspm.hh @@ -0,0 +1,117 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "../utilities.hh" + +/** A generic abstract class defining the operation of timing a SpMSpM BLAS + * kernel for n iterations */ +template +class spmspm { +public: + spmspm(const int iters) : iterations_(iters) {} + + /** Call the kernel n times. 
Returns the time elapsed for all n calls + * in seconds */ + time_checksum_gflop compute() { + // Start the timer + std::chrono::time_point startTime = + std::chrono::high_resolution_clock::now(); + + // perform the SpMSpM calls + preLoopRequirements(); + for (int i = 0; i < iterations_; i++) { + callSpmspm(); + } + postLoopRequirements(); + + // Stop the timer + std::chrono::time_point endTime = + std::chrono::high_resolution_clock::now(); + std::chrono::duration time_s = endTime - startTime; + + double checksum = calcChecksum(); + + postCallKernelCleanup(); + + return {time_s.count(), checksum, 0.0}; + } + +private: + /** Performs the steps required before calling the SpMSpM kernel that + * should be timed */ + virtual void preLoopRequirements() = 0; + + /** Perform the SpMSpM kernel. */ + virtual void callSpmspm() = 0; + + /** Perform any steps required after calling the SpMSpM kernel that should + * be timed */ + virtual void postLoopRequirements() = 0; + + /** Do the necessary cleanup after the kernel has been finished that + * should not be timed */ + virtual void postCallKernelCleanup() = 0; + + /** Calculate a checksum from the result matrix C. */ + constexpr double calcChecksum() { + return 0; + if (C_nnz_ == 0) { + return (double)0.0; // No non-zeros, return zero checksum + } else if (C_nnz_ == 1) { + return (double)C_vals_[0]; // Single non-zero, return its value + } else { + return (double)C_vals_[0] + (double)C_vals_[C_nnz_ - 1]; + } + } + +protected: + /** Set up the starting matrices */ + void initInputMatrices() { + toSparseFormat(); + } + + /** Move matrices into the sparse representation of for the given library */ + virtual void toSparseFormat() = 0; + + /** Call the external consume() function on the matrices */ + void callConsume() { consume((void*)A_, (void*)B_, (void*)C_); }/** Recursive function to populate sparse matrices */ + + /** The number of iterations to perform per problem size. */ + const int iterations_; + + /** Matrix dimension M. */ + int m_ = 0; + + /** Matrix dimension N. */ + int n_ = 0; + + /** Matrix dimension K. */ + int k_ = 0; + + /** Dense representation of input matrix A. */ + T* A_; + + /** Dense representation of input matrix B. */ + T* B_; + + /** Dense representation of output matrix C. */ + T* C_; + + /** CSR representation of output matrix C. */ + int64_t C_nnz_; + int64_t* C_rows_; + int64_t* C_cols_; + T* C_vals_; + + int64_t A_nnz_ = 0; + int64_t B_nnz_ = 0; + + double sparsity_; + + matrixType type_; +}; \ No newline at end of file diff --git a/include/main.hh b/include/main.hh index cc0bb8f..37a8d9a 100644 --- a/include/main.hh +++ b/include/main.hh @@ -5,7 +5,10 @@ #include #include "doGemm.hh" +#include "doSpmdnm.hh" +#include "doSpmspm.hh" #include "doGemv.hh" +#include "doSpmdnv.hh" #include "utilities.hh" /** A function which prints standard configuration information to stdout. */ @@ -14,5 +17,5 @@ void printBenchmarkConfig(const int iters, const int upperLimit); /** A function to parse a string to integer. */ int parseInt(const char* str); -/** A function which parsen the runtime arguments. */ -void getParameters(int argc, char* argv[]); \ No newline at end of file +/** A function which parses the runtime arguments. 
*/ +void getParameters(int argc, char** argv); \ No newline at end of file diff --git a/include/utilities.hh b/include/utilities.hh index ac0aeb0..9e7c64a 100644 --- a/include/utilities.hh +++ b/include/utilities.hh @@ -1,5 +1,15 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + // Define CPU related macros #if defined CPU_ARMPL #define CPU_LIB_NAME "Arm Performance Libraries" @@ -53,6 +63,12 @@ enum class gpuOffloadType : uint8_t { unified, }; +enum class matrixType : uint8_t { + rmat = 0, + random, + finiteElements, +}; + // Define struct which contains a runtime, checksum value, and gflop/s value struct time_checksum_gflop { double runtime = 0.0; @@ -76,4 +92,466 @@ struct cpuGpu_offloadThreshold { // performed. extern "C" { int consume(void* a, void* b, void* c); -} \ No newline at end of file +} + + +template +void printCSR(uint64_t nRows, + uint64_t nnz, + const int_type* rows, + const int_type* cols, + const fp_type* vals) { + std::cout << "ROWS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i <= nRows; i++) { + std::cout << rows[i]; + if (i < nRows) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + + std::cout << "COLS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i < nnz; i++) { + std::cout << cols[i]; + if (i < nnz - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + + std::cout << "VALS:" << std::endl; + std::cout << "\t["; + for (uint64_t i = 0; i < nnz; i++) { + std::cout << vals[i]; + if (i < nnz - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; +} + +template +void checkCSRValid(uint64_t nRows, + uint64_t nCols, + uint64_t nnz, + const int_type* rows, + const int_type* cols, + const fp_type* vals) { + if (rows[0] != 0) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[0] is not 0" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + + for (uint64_t r = 0; r < nRows; r++) { + if (rows[r] > rows[r + 1]) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[" << r << "] > row_pointer[" << (r + 1) << "]" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + + if (rows[nRows] != (int_type)nnz) { + std::cerr << "[ERROR]: CSR INVALID - row_pointer[nRows] != nnz" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + + for (uint64_t i = 0; i < nnz; i++) { + if (cols[i] < 0 || cols[i] >= (int_type)nCols) { + std::cerr << "[ERROR]: CSR INVALID - column index out of bounds" << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + + for (uint64_t r = 0; r < nRows; r++) { + for (int_type j = rows[r]; j + 1 < (rows[r + 1]); j++) { + if (cols[j] > cols[j + 1]) { + std::cerr << "[ERROR]: CSR INVALID - column indices not sorted in row " << r << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + if (cols[j] == cols[j + 1]) { + std::cerr << "[ERROR]: CSR INVALID - duplicate column indices in row " << r << std::endl; + printCSR(nRows, nnz, rows, cols, vals); + exit(1); + } + } + } +} + + +/** + * @brief Generate an R-MAT matrix directly in CSR format. 
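+ *
+ * Typical call (a hedged sketch with hypothetical sizes; the value and index
+ * types are deduced from the pointer and size arguments):
+ * @code
+ *   const int64_t nrows = 1 << 10, ncols = 1 << 10, nnz = 1 << 14;
+ *   std::vector<double>  vals(nnz);
+ *   std::vector<int64_t> cols(nnz), rows(nrows + 1);
+ *   rMatCSR(vals.data(), cols.data(), rows.data(),
+ *           nrows, ncols, nnz);  // default SEED and a/b/c/d quadrant weights
+ * @endcode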
+ * + * This function samples `nnz` edges (nonzeros) from the R-MAT distribution and + * writes the result directly into CSR arrays: + * - vals[k] : value of the k-th nonzero (here set to 1 by default) + * - cols[k] : column index of the k-th nonzero + * - rows[i] : starting offset in (vals, cols) for row i + * (classic CSR row pointer of length nrows+1) + * + * Memory usage is O(nnz) (plus a temporary edge list), avoiding any dense + * matrix construction. + * + * IMPORTANT BEHAVIOR: + * - Undirected: When `undirected == true`, this code only enforces u <= v + * during sampling (so edges are oriented consistently). It does NOT insert + * the symmetric counterpart (v,u). If you want a symmetric matrix, you must + * explicitly duplicate edges (except diagonal) before CSR conversion. + * - Seeding: Uses a global or external `SEED` to make the generator + * deterministic/reproducible. Ensure `SEED` is defined in your translation unit. + * + * Complexity: + * - Sampling: O(nnz * log(max(nrows,ncols))) bit-decisions per edge + * - Sorting: O(nnz log nnz) (by row, then col) + * - CSR build: O(nnz + nrows) + * + * @tparam T Numeric type for values (e.g., float, double, int) + * @tparam int_type Integer type for indices (e.g., int, int32_t, int64_t) + * + * @param vals Output array of length nnz (nonzero values) + * @param cols Output array of length nnz (column indices) + * @param rows Output array of length nrows+1 (row pointer) + * @param nrows Number of rows in the matrix + * @param ncols Number of columns in the matrix + * @param nnz Number of nonzeros to generate + * @param a,b,c,d R-MAT quadrant probabilities (must sum to 1; typical: 0.57,0.19,0.19,0.05) + * @param noise Optional jitter in probabilities each bit step (0.0 = none) + * @param no_self_loops If true, edges with u == v are discarded and resampled + * @param undirected If true, enforce u <= v in the sampled edge; does NOT mirror edges + * + * @note Typical R-MAT parameters for realistic graphs: + * - a=0.45, b=0.15, c=0.15, d=0.25 (Kronecker-like) + * - a=0.57, b=0.19, c=0.19, d=0.05 (more skewed) + * + * @note For sparse linear algebra benchmarks, this generates matrices with: + * - Irregular sparsity patterns (not banded/block-structured) + * - Variable row/column densities challenging load balancing + * - Realistic cache behavior representative of graph applications + * + * @warning Uses 64-bit indexing to prevent overflow for large matrices (n > 46k) + * @warning Non-thread-safe due to shared random number generator + * + * References: + * - Chakrabarti, D., Zhan, Y., & Faloutsos, C. (2004). R-MAT: A recursive model + * for graph mining. SIAM International Conference on Data Mining. + * - Leskovec, J., et al. (2010). Kronecker graphs: An approach to modeling networks. + * Journal of Machine Learning Research, 11, 985-1042. + */ +template +void rMatCSR(T* vals, int_type* cols, int_type* rows, + int_type nrows, int_type ncols, int_type nnz, + uint64_t seed = SEED, + double a = 0.57, + double b = 0.19, + double c = 0.19, + double d = 0.05, + double noise = 0.0, + bool no_self_loops = false, + bool undirected = false) { + // Number of bits needed to index into the row/col ranges. + // R-MAT decides each bit from MSB→LSB by picking a quadrant. + int row_bits = static_cast(std::ceil(std::log2(nrows))); + int col_bits = static_cast(std::ceil(std::log2(ncols))); + + // Set up RNG. 
Uses srand for value generation, and uniform[0,1) + // for quadrant selection + srand(seed); + std::default_random_engine gen; + std::uniform_real_distribution dist(0.0, 1.0); + gen.seed(seed); + + + // Temporary storage of sampled edges as (row, col) pairs. + // We reserve exactly nnz slots and will push_back exactly nnz valid edges. + std::vector> edges; + edges.reserve(nnz); + + // Keep sampling until we have nnz valid edges. + // Invalid candidates (out-of-bounds due to non-powers-of-two, self-loops, etc.) + // are discarded by continuing the loop without incrementing the edge count. + int edge_idx = 0; + while (edge_idx < nnz) { + int u = 0; // Sampled row index (as int, cast to int_type later) + int v = 0; // Sampled column index + + // Base quadrant probabilities (A,B,C,D). We optionally jitter these + // at each bit decision if 'noise' > 0. + double A = a, B = b, C = c, D = d; + + // For each bit (from most-significant to least), decide which quadrant + // the edge falls into and set the corresponding bit of (u,v). + for (int bit = 0; bit < std::max(row_bits, col_bits); ++bit) { + // Optional noise: perturb A,B,C,D slightly, then renormalize. + if (noise > 0.0) { + auto jitter = [&](double val) { + // Perturb within ±noise, clamp to [0,1] lower bound via max(0,•) + return std::max(0.0, val + (dist(gen) * 2.0 - 1.0) * noise); + }; + A = jitter(a); + B = jitter(b); + C = jitter(c); + D = jitter(d); + double sum = A + B + C + D; + // Guard against degenerate total (shouldn’t happen unless noise is extreme) + A = (sum > 0) ? (A / sum) : 0.25; + B = (sum > 0) ? (B / sum) : 0.25; + C = (sum > 0) ? (C / sum) : 0.25; + D = (sum > 0) ? (D / sum) : 0.25; + } + + // Draw r ~ U(0,1) and select quadrant by cumulative thresholds. + double r = dist(gen); + double t1 = A; + double t2 = A + B; + double t3 = A + B + C; + + int row_bit = 0, col_bit = 0; + if (r < t1) { + // Quadrant 00 + row_bit = 0; col_bit = 0; + } else if (r < t2) { + // Quadrant 01 + row_bit = 0; col_bit = 1; + } else if (r < t3) { + // Quadrant 10 + row_bit = 1; col_bit = 0; + } else { + // Quadrant 11 + row_bit = 1; col_bit = 1; + } + + // Only set bits that are within the bit-width of rows/cols respectively. + if (bit < row_bits) u = (u << 1) | row_bit; + if (bit < col_bits) v = (v << 1) | col_bit; + } + + // If dimensions are not powers of two, some combinations will exceed bounds. + if (u >= nrows) u = u % nrows; + if (v >= ncols) v = v % ncols; + // If undirected, orient edges consistently (store the "upper-triangular" orientation). + // NOTE: This does NOT create symmetric pairs; it only enforces a canonical ordering. + if (undirected && u > v) std::swap(u, v); + // If a duplicate, do not commit edge + if (std::find(edges.begin(), edges.end(), std::make_pair((int_type)u, (int_type)v)) != edges.end()) continue; + + // Commit the sampled edge. + edges.emplace_back((int_type)u, (int_type)v); + ++edge_idx; + } + + + // Sort edges primarily by row, and secondarily by column. + // CSR expects nonzeros grouped by row; sorting also makes columns within + // each row non-decreasing, which is often desirable. + std::sort(edges.begin(), edges.end(), + [](auto& a, auto& b) { + return (a.first < b.first) || + (a.first == b.first && a.second < b.second); + }); + + // Initialize row pointer array with zeros. + // rows[i] will eventually hold the starting index in (vals, cols) of row i. + // rows[nrows] will equal nnz after prefix-sum (the total number of nonzeros). 
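+  // Worked illustration (hypothetical counts): per-row nonzero counts of
+  // [2, 0, 3] become row pointers [0, 2, 2, 5] after the counting pass and
+  // the prefix-sum loop below.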
+ for (size_t i = 0; i < static_cast(nrows + 1); i++) rows[i] = 0; + + // Linear pass over sorted edges to fill cols/vals and count entries per row. + // We write the k-th edge's column into cols[k] and its value into vals[k]. + // Simultaneously, we increment a per-row count into rows[r+1]. + for (size_t i = 0; i < static_cast(nnz); ++i) { + const int_type r = edges[static_cast(i)].first; + const int_type c = edges[static_cast(i)].second; + + cols[static_cast(i)] = c; + vals[static_cast(i)] = (T)((double)(rand() % 100) / 3.0); + + // Count one nonzero in row r by bumping rows[r+1]. + // After this loop, rows[k+1] holds the count of nonzeros in row k. + rows[static_cast(r) + 1]++; + } + for (size_t i = 0; i < static_cast(nrows); i++) { + rows[static_cast(i) + 1] += rows[static_cast(i)]; + } + + checkCSRValid(nrows, ncols, nnz, rows, cols, vals); +} + +template +void randomCSR(T* vals, int_type* cols, int_type* rows, + int nrows, int ncols, int nnz, unsigned int seed = SEED) { + if ((int64_t)nnz >= (int64_t)nrows * (int64_t)ncols) { + std::cerr << "ERROR: nnz exceeds maximum possible non-zeros." << std::endl; + exit(1); + } else if (nnz <= 0) { + std::cerr << "ERROR: nnz must be positive." << std::endl; + exit(1); + } + + srand(seed); + std::default_random_engine gen; + std::uniform_int_distribution col_dist(0, ncols - 1); + gen.seed(seed); + + // Generate number of non-zeros per row + std::vector row_counts(nrows, 0); + int total_nonzeros = 0; + while (total_nonzeros < nnz) { + int_type r = rand() % nrows; + if (row_counts[r] >= ncols) continue; // Skip if row is already full + row_counts[r]++; + total_nonzeros++; + } + + // Create the row pointer array + rows[0] = 0; + for (int r = 0; r < nrows; r++) { + rows[r + 1] = rows[r] + row_counts[r]; + } + + int index = 0; + // Make a bitmap of the columns that are going to be used in this row + std::vector rCols(ncols, false); + for (int r = 0; r < nrows; r++) { + int c = 0; + while (c < row_counts[r]) { + int_type col = col_dist(gen); + if (!rCols[col]) { + rCols[col] = true; + c++; + } + } + // Create the column index array + for (int_type cIndex = 0; cIndex < ncols; cIndex++) { + if (rCols[cIndex]) { + cols[index] = cIndex; + index++; + rCols[cIndex] = false; // Reset the bitmap for the next row + } + } + } + + // Randomise the values array + index = 0; + for (int r = 0; r < nrows; r++) { + for (int j = 0; j < row_counts[r]; j++) { + vals[index] = (T)((double)(rand() % 100) / 3.0); + index++; + } + } + checkCSRValid(nrows, ncols, nnz, rows, cols, vals); +} + +template +int64_t calcCNNZ(int_type A_n_rows, int_type A_nnz, int_type* A_rows, int_type* A_cols, + int_type B_n_cols, int_type B_nnz, int_type* B_rows, int_type* B_cols) { + int64_t C_nnz = 0; + + for (int_type i = 0; i < A_n_rows; i++) { + for (int_type j = A_rows[i]; j < A_rows[i + 1]; j++) { + int_type a_col = A_cols[j]; + if (a_col < 0 || a_col >= B_n_cols) { + std::cerr << "[ERROR]: calcCNNZ - A column index out of bounds for B" << std::endl; + continue; + } + for (int_type k = B_rows[a_col]; k < B_rows[a_col + 1]; k++) { + if (B_cols[k] == i) { + C_nnz++; + break; + } + } + } + } + + return C_nnz; +} + +/** + * @brief Generates a densely-filled banded matrix. + * + * It first calculates the minimum bandwidth 'k' required to store at least + * 'nnz' elements. It then fills the band (diagonals -k to +k) row by row, + * respecting matrix boundaries, until exactly 'nnz' elements are written. 
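+ *
+ * Sizing intuition (the code finds the radius empirically, which also covers
+ * rectangular matrices): for a square matrix a band of radius k holds roughly
+ * nrows * (2k + 1) entries away from the boundaries, so k is grown until that
+ * count first reaches 'nnz'.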
+ */ +template +void finiteElementCSR(T* vals, int_type* cols, int_type* rows, + int nrows, int ncols, int_type nnz, + unsigned int seed = SEED) +{ + long long max_nnz = (long long)nrows * ncols; + if (nnz > max_nnz) { + std::cerr << "Warning: Clamping NNZ." << std::endl; + nnz = max_nnz; + } + + if (nnz == 0) { + for (int r = 0; r <= nrows; r++) rows[r] = 0; + return; + } + + std::mt19937 gen(seed); + std::uniform_real_distribution val_dist(-1.5, 1.5); + + // --- 1. Find the bandwidth 'k' needed to fit 'nnz' --- + int_type k = 0; // k is the "radius" of the band + long long nnz_in_band = 0; + while (nnz_in_band < nnz) { + nnz_in_band = 0; + for (int r = 0; r < nrows; r++) { + int_type c_midpoint = r * ncols / nrows; + int_type c_min = std::max(0, c_midpoint - k); + int_type c_max = std::min(ncols - 1, c_midpoint + k); + nnz_in_band += (c_max - c_min + 1); + } + + if (nnz_in_band >= nnz) break; // Found a big enough band + + k++; + + // Safety break if k grows larger than the matrix + if (k > std::max(nrows, ncols)) { + std::cerr << "Warning: Bandwidth loop failed. Clamping NNZ." << std::endl; + nnz = nnz_in_band; // nnz is now the max possible + break; + } + } + + // --- 2. Fill the CSR arrays using the discovered bandwidth 'k' --- + rows[0] = 0; + int_type current_nnz = 0; + + for (int r = 0; r < nrows; r++) { + // Find the correct column bounds for this row + int_type c_midpoint = r * ncols / nrows; + int_type c_min = std::max(0, c_midpoint - k); + int_type c_max = std::min(ncols - 1, c_midpoint + k); + + // Fill the band for this row + for (int_type c = c_min; c <= c_max; c++) { + // Stop *exactly* at nnz + if (current_nnz >= nnz) { + break; + } + + vals[current_nnz] = val_dist(gen); + cols[current_nnz] = c; + current_nnz++; + } + + rows[r + 1] = current_nnz; + + if (current_nnz >= nnz) { + // We're done. Fill the rest of the row pointers. + for (int rest_r = r + 1; rest_r < nrows; rest_r++) { + rows[rest_r + 1] = nnz; + } + break; // Exit the main row loop + } + } + + // Ensure the final pointer is correct + rows[nrows] = current_nnz; +} diff --git a/oneMKL/CPU/gemm.hh b/oneMKL/CPU/gemm.hh index bdb7ba5..0ae554a 100644 --- a/oneMKL/CPU/gemm.hh +++ b/oneMKL/CPU/gemm.hh @@ -50,8 +50,7 @@ class gemm_cpu : public gemm { std::max(1, m_)); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." - << std::endl; + std::cerr << "ERROR - Datatype for OneMKL CPU GEMM kernel not supported." << std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/oneMKL/CPU/gemv.hh b/oneMKL/CPU/gemv.hh index b53a83c..8a5a013 100644 --- a/oneMKL/CPU/gemv.hh +++ b/oneMKL/CPU/gemv.hh @@ -47,8 +47,7 @@ class gemv_cpu : public gemv { std::max(1, m_), x_, vecIncrement_, beta, y_, vecIncrement_); } else { // Un-specialised class will not do any work - print error and exit. - std::cout << "ERROR - Datatype for OneMKL CPU GEMV kernel not supported." - << std::endl; + std::cout << "ERROR - Datatype for OneMKL CPU GEMV kernel not supported." 
<< std::endl; exit(1); } // Ensure compiler doesn't optimise away the work being done diff --git a/oneMKL/CPU/spmdnm.hh b/oneMKL/CPU/spmdnm.hh new file mode 100644 index 0000000..6e5a132 --- /dev/null +++ b/oneMKL/CPU/spmdnm.hh @@ -0,0 +1,171 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include +#include + +#include "../../include/kernels/CPU/spmdnm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-dense matrix BLAS kernels. */ +template +class spmdnm_cpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::callConsume; + using spmdnm::initInputMatrices; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::sparsity_; + using spmdnm::type_; + using spmdnm::nnz_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + + m_mkl_ = m; + n_mkl_ = n; + k_mkl_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_ = (T*)mkl_malloc(sizeof(T) * k_ * n_, 64); + C_ = (T*)mkl_malloc(sizeof(T) * m_ * n_, 64); + + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, 64); + // Make a temporary rows array of the ususal CSR type, to then turn into the two-array MKL version + MKL_INT* A_rows_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, k_, nnz_); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows_[i]; + A_rowse_[i] = A_rows_[i + 1]; + } + // Clean up the temporary array + mkl_free(A_rows_); + } + +private: + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + } + + void callSpmdnm() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_mm(operation_, alpha, A_csr_, description_, + layout_, B_, n_mkl_, n_mkl_, beta, C_, + n_mkl_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_mm(operation_, alpha, A_csr_, description_, + layout_, B_, n_mkl_, n_mkl_, beta, C_, + n_mkl_); + } else { + // Un-specialised class will not do any work - print error and exit. + std::cerr << "ERROR - Datatype for OneMKL CPU SpGEMV kernel not " + "supported." 
<< std::endl; + exit(1); + } + + callConsume(); + } + + void postLoopRequirements() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void postCallKernelCleanup() override { + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + mkl_free(A_cols_); + mkl_free(A_vals_); + mkl_free(B_); + mkl_free(C_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + + matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL, + SPARSE_FILL_MODE_LOWER, + SPARSE_DIAG_NON_UNIT}; + sparse_layout_t layout_ = SPARSE_LAYOUT_ROW_MAJOR; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + MKL_INT k_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + sparse_matrix_t A_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif \ No newline at end of file diff --git a/oneMKL/CPU/spmdnv.hh b/oneMKL/CPU/spmdnv.hh new file mode 100644 index 0000000..a5414f9 --- /dev/null +++ b/oneMKL/CPU/spmdnv.hh @@ -0,0 +1,151 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include + +#include + +#include "../../include/kernels/CPU/spmdnv.hh" +#include "../../include/utilities.hh" + +namespace cpu { +template +class spmdnv_cpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::callConsume; + using spmdnv::initInputMatrixVector; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::sparsity_; + using spmdnv::type_; + using spmdnv::nnz_; + + void initialise(int m, int n, double sparsity, matrixType type, + bool binary = false) { + m_ = m; + n_ = n; + sparsity_ = sparsity; + type_ = type; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + x_ = (T*)mkl_malloc(sizeof(T) * n_, 64); + y_ = (T*)mkl_malloc(sizeof(T) * m_, 64); + + initInputMatrixVector(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * nnz_, 64); + MKL_INT* A_rows_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows_, m_, n_, nnz_); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows_[i]; + A_rowse_[i] = A_rows_[i + 1]; + } + + mkl_free(A_rows_); + } + +private: + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + n_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + n_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cout << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void callSpMDnV() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_mv(operation_, alpha, A_csr_, description_, x_, + beta, y_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_mv(operation_, alpha, A_csr_, 
description_, x_, + beta, y_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + callConsume(); + } + + void postLoopRequirements() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + + void postCallKernelCleanup() override { + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + mkl_free(A_cols_); + mkl_free(A_vals_); + mkl_free(x_); + mkl_free(y_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + matrix_descr description_ = {SPARSE_MATRIX_TYPE_GENERAL, + SPARSE_FILL_MODE_LOWER, + SPARSE_DIAG_NON_UNIT}; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + sparse_matrix_t A_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif diff --git a/oneMKL/CPU/spmspm.hh b/oneMKL/CPU/spmspm.hh new file mode 100644 index 0000000..2671ebb --- /dev/null +++ b/oneMKL/CPU/spmspm.hh @@ -0,0 +1,256 @@ +#pragma once + +#ifdef CPU_ONEMKL +#include +#include + +#include + +#include "../../include/kernels/CPU/spmspm.hh" +#include "../../include/utilities.hh" + +namespace cpu { +/** A class for sparse matrix-sparse matrix CPU BLAS kernels. */ +template +class spmspm_cpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::callConsume; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::sparsity_; + using spmspm::type_; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::C_nnz_; + using spmspm::C_vals_; + + void initialise(int m, int n, int k, double sparsity, + matrixType type, bool binary = false) { + m_ = m; + n_ = n; + k_ = k; + + m_mkl_ = m; + n_mkl_ = n; + k_mkl_ = k; + + sparsity_ = sparsity; + type_ = type; + + /** Determine the number of nnz elements in A and B */ + A_nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + A_vals_ = (T*)mkl_malloc(sizeof(T) * A_nnz_, 64); + A_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * A_nnz_, 64); + MKL_INT* A_rows = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (m_ + 1), 64); + A_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + A_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * m_, 64); + B_vals_ = (T*)mkl_malloc(sizeof(T) * B_nnz_, 64); + B_cols_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * B_nnz_, 64); + MKL_INT* B_rows = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * (k_ + 1), 64); + B_rowsb_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * k_, 64); + B_rowse_ = (MKL_INT*)mkl_malloc(sizeof(MKL_INT) * k_, 64); + + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + randomCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_, A_cols_, A_rows, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_, B_cols_, B_rows, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Unknown matrix type" << 
std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows, A_cols_, k_, B_nnz_, B_rows, B_cols_) == 0); + + for (uint64_t i = 0; i < m_; i++) { + A_rowsb_[i] = A_rows[i]; + A_rowse_[i] = A_rows[i + 1]; + } + + mkl_free(A_rows); + + for (uint64_t i = 0; i < k_; i++) { + B_rowsb_[i] = B_rows[i]; + B_rowse_[i] = B_rows[i + 1]; + } + mkl_free(B_rows); + } + +private: + + void preLoopRequirements() override { + if constexpr (std::is_same_v) { + status_ = mkl_sparse_s_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = mkl_sparse_s_create_csr(&B_csr_, + indexing_, + k_, + n_, + B_rowsb_, + B_rowse_, + B_cols_, + B_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_create_csr(&A_csr_, + indexing_, + m_, + k_, + A_rowsb_, + A_rowse_, + A_cols_, + A_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + + status_ = mkl_sparse_d_create_csr(&B_csr_, + indexing_, + k_, + n_, + B_rowsb_, + B_rowse_, + B_cols_, + B_vals_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + } + } + + void callSpmspm() override { + status_ = mkl_sparse_spmm(operation_, A_csr_, B_csr_, &C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + status_ = mkl_sparse_order(C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + callConsume(); + } + + void postLoopRequirements() override { + if constexpr(std::is_same_v) { + status_ = mkl_sparse_s_export_csr(C_csr_, + &indexing_, + &m_mkl_, + &n_mkl_, + &C_rowsb_, + &C_rowse_, + &C_cols_, + &C_vals_); + } else if constexpr (std::is_same_v) { + status_ = mkl_sparse_d_export_csr(C_csr_, + &indexing_, + &m_mkl_, + &n_mkl_, + &C_rowsb_, + &C_rowse_, + &C_cols_, + &C_vals_); + } + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + C_nnz_ = C_rowse_[m_ - 1]; + } + + void postCallKernelCleanup() override { + status_ = mkl_sparse_destroy(A_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(B_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + status_ = mkl_sparse_destroy(C_csr_); + if (status_ != SPARSE_STATUS_SUCCESS) { + std::cerr << "ERROR " << status_ << std::endl; + exit(1); + } + + mkl_free(A_vals_); + mkl_free(A_cols_); + mkl_free(A_rowsb_); + mkl_free(A_rowse_); + + mkl_free(B_vals_); + mkl_free(B_cols_); + mkl_free(B_rowsb_); + mkl_free(B_rowse_); + } + + sparse_status_t status_; + + sparse_index_base_t indexing_ = SPARSE_INDEX_BASE_ZERO; + sparse_operation_t operation_ = SPARSE_OPERATION_NON_TRANSPOSE; + + MKL_INT m_mkl_; + MKL_INT n_mkl_; + MKL_INT k_mkl_; + + T* A_vals_; + MKL_INT* A_cols_; + MKL_INT* A_rowsb_; + MKL_INT* A_rowse_; + + T* B_vals_; + MKL_INT* B_cols_; + MKL_INT* B_rowsb_; + MKL_INT* B_rowse_; + + MKL_INT* C_cols_; + MKL_INT* C_rowsb_; + MKL_INT* C_rowse_; + + sparse_matrix_t A_csr_; + sparse_matrix_t B_csr_; + sparse_matrix_t C_csr_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + + +#endif \ No newline at end 
of file diff --git a/oneMKL/GPU/common.hh b/oneMKL/GPU/common.hh index 30fccfa..13f3715 100644 --- a/oneMKL/GPU/common.hh +++ b/oneMKL/GPU/common.hh @@ -3,8 +3,9 @@ #ifdef GPU_ONEMKL #include - +#include #include +#include #include // Create an exception handler for asynchronous SYCL exceptions @@ -14,8 +15,7 @@ static const std::function exception_handler = try { std::rethrow_exception(e); } catch (std::exception const& e) { - std::cout << "ERROR - Caught asynchronous SYCL exception : " - << e.what() << std::endl; + std::cerr << "ERROR - Caught asynchronous SYCL exception : " << e.what() << std::endl; } } }; diff --git a/oneMKL/GPU/gemm.hh b/oneMKL/GPU/gemm.hh index 44fa3b2..deb7723 100644 --- a/oneMKL/GPU/gemm.hh +++ b/oneMKL/GPU/gemm.hh @@ -111,10 +111,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Always):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Always):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } // Offload output data from device to host gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); @@ -131,10 +130,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Once):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Once): " << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -147,10 +145,9 @@ class gemm_gpu : public gemm { (int64_t)std::max(1, k_), beta, C_, (int64_t)std::max(1, m_), {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMM " - "(Unified):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMM " + "(Unified): " << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -172,7 +169,7 @@ class gemm_gpu : public gemm { break; } case gpuOffloadType::unified: { - // TODO - Ensure all data resides on host once work has completed + // Ensure all data resides on host once work has completed gpuQueue_.wait_and_throw(); break; } diff --git a/oneMKL/GPU/gemv.hh b/oneMKL/GPU/gemv.hh index ffe9f6c..6c3264b 100644 --- a/oneMKL/GPU/gemv.hh +++ b/oneMKL/GPU/gemv.hh @@ -109,10 +109,9 @@ class gemv_gpu : public gemv { y_device_, vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMV " - "(Always):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Always):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } // Offload output data from device to host gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); @@ -128,10 +127,9 @@ class gemv_gpu : public gemv { y_device_, vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during 
GEMV " - "(Once):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Once):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -144,10 +142,9 @@ class gemv_gpu : public gemv { vecIncrement_, {}) .wait_and_throw(); } catch (sycl::exception const& e) { - std::cout << "ERROR - Caught synchronous SYCL exception during GEMV " - "(Unified):\n" - << e.what() << std::endl - << "OpenCL status: " << e.code().value() << std::endl; + std::cerr << "ERROR - Caught synchronous SYCL exception during GEMV " + "(Unified):\n" << e.what() << std::endl; + std::cerr << "OpenCL status: " << e.code().value() << std::endl; } break; } @@ -169,7 +166,6 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { - // TODO - Ensure all data resides on host once work has completed gpuQueue_.wait_and_throw(); break; } diff --git a/oneMKL/GPU/spmdnm.hh b/oneMKL/GPU/spmdnm.hh new file mode 100644 index 0000000..b334fcd --- /dev/null +++ b/oneMKL/GPU/spmdnm.hh @@ -0,0 +1,340 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include "../../include/kernels/GPU/spmdnm.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +#include + +namespace gpu { +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::nnz_; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::sparsity_; + using spmdnm::type_; + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) override { + // Perform set-up which doesn't need to happen every problem size change. 
+ if (firstRun_) { + firstRun_ = false; + try { + myGpu_ = sycl::device(sycl::gpu_selector_v); + } catch (const std::exception& e) { + std::cerr << "ERROR - No GPU device found: " << e.what() << '\n'; + exit(1); + } + gpuQueue_ = sycl::queue(myGpu_, exception_handler); + } + + try { + // Initialize ALL pointers to nullptr FIRST + B_ = nullptr; + C_ = nullptr; + A_vals_ = nullptr; + A_cols_ = nullptr; + A_rows_ = nullptr; + A_vals_device_ = nullptr; + A_cols_device_ = nullptr; + A_rows_device_ = nullptr; + B_device_ = nullptr; + C_device_ = nullptr; + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + m_ = m; + n_ = n; + k_ = k; + + layout_ = oneapi::mkl::layout::row_major; + operationA_ = oneapi::mkl::transpose::nontrans; + operationB_ = oneapi::mkl::transpose::nontrans; + index_ = oneapi::mkl::index_base::zero; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + + if (offload_ == gpuOffloadType::unified) { + B_ = (T*)sycl::malloc_shared(sizeof(T) * k_ * n_, gpuQueue_); + C_ = (T*)sycl::malloc_shared(sizeof(T) * m_ * n_, gpuQueue_); + } else { + // Host memory allocation + B_ = (T*)sycl::malloc_host(sizeof(T) * k_ * n_, gpuQueue_); + C_ = (T*)sycl::malloc_host(sizeof(T) * m_ * n_, gpuQueue_); + + // Device memory allocation + B_device_ = (T*)sycl::malloc_device(sizeof(T) * k_ * n_, gpuQueue_); + C_device_ = (T*)sycl::malloc_device(sizeof(T) * m_ * n_, gpuQueue_); + } + initInputMatrices(); + } catch (const std::exception& e) { + std::cerr << "ERROR in initialise(): " << e.what() << std::endl; + exit(1); + } + } + + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_store_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_store_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, nnz_); + } else { + std::cerr << "ERROR - Unknown matrix type" << std::endl; + exit(1); + } + } + + + if (offload_ == gpuOffloadType::unified) { + A_vals_ = (T*)sycl::malloc_shared(sizeof(T) * nnz_, gpuQueue_); + A_cols_ = (int64_t*)sycl::malloc_shared(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_ = (int64_t*)sycl::malloc_shared(sizeof(int64_t) * (m_ + 1), gpuQueue_); + } else { + A_vals_ = (T*)sycl::malloc_host(sizeof(T) * nnz_, gpuQueue_); + A_cols_ = (int64_t*)sycl::malloc_host(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_ = (int64_t*)sycl::malloc_host(sizeof(int64_t) * (m_ + 1), gpuQueue_); + + A_vals_device_ = (T*)sycl::malloc_device(sizeof(T) * nnz_, gpuQueue_); + A_cols_device_ = (int64_t*)sycl::malloc_device(sizeof(int64_t) * nnz_, gpuQueue_); + A_rows_device_ = (int64_t*)sycl::malloc_device(sizeof(int64_t) * (m_ + 1), gpuQueue_); + } + + memcpy(A_rows_, A_rows_store_, sizeof(int64_t) * (m_ + 1)); + memcpy(A_cols_, A_cols_store_, sizeof(int64_t) * nnz_); + memcpy(A_vals_, A_vals_store_, sizeof(T) * nnz_); + } + +private: + void preLoopRequirements() override { + switch(offload_) { + case gpuOffloadType::always: break; + case gpuOffloadType::once: { + // Moving memory over to device from host + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + 
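+        // (queue::memcpy is asynchronous; the wait() below conservatively ensures
+        // these host-to-device copies have completed before the CSR handle is
+        // populated and used)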
gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(B_device_, B_, sizeof(T) * k_ * n_); + gpuQueue_.wait(); // Is this needed? + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + gpuQueue_.wait_and_throw(); + break; + } + case gpuOffloadType::unified: { + // For unified memory, set up matrix handle once + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_, + A_cols_, + A_vals_); + gpuQueue_.wait_and_throw(); + break; + } + } + } + + void callSpmdnm() override { + switch (offload_) { + case gpuOffloadType::always: { + // Copy data to device for this iteration + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(B_device_, B_, sizeof(T) * k_ * n_); + gpuQueue_.wait(); + + oneapi::mkl::sparse::init_matrix_handle(&A_device_); + oneapi::mkl::sparse::set_csr_data(gpuQueue_, + A_device_, + m_, + k_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + gpuQueue_.wait(); + + // Do computation + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_device_, + n_, + n_, + beta, + C_device_, + n_); + gpuQueue_.wait(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during " + "spmdnm (Always):\n" << e.what() << std::endl << + "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + + // Copy result back to host + gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); + gpuQueue_.wait(); + + // Clean up matrix handle + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &A_device_); + gpuQueue_.wait(); + break; + } + case gpuOffloadType::once: { + // Buffers already exist, just do computation + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_device_, + n_, + n_, + beta, + C_device_, + n_); + gpuQueue_.wait(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during " + "spmdnm (Once):\n" << e.what() << std::endl << + "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + break; + } + case gpuOffloadType::unified: { + // Direct computation with unified memory + try { + oneapi::mkl::sparse::gemm(gpuQueue_, + layout_, + operationA_, + operationB_, + alpha, + A_device_, + B_, + n_, + n_, + beta, + C_, + n_); + gpuQueue_.wait_and_throw(); + } catch (sycl::exception const& e) { + std::cerr << "ERROR - Caught synchronous SYCL exception during spmdnm (Unified): " << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + exit(1); + } + break; + } + } + } + + void postLoopRequirements() override { + // Clean up buffers that were created for the entire loop duration + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(C_, C_device_, sizeof(T) * m_ * n_); + gpuQueue_.wait(); + } + + if (offload_ != gpuOffloadType::always) { + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &A_device_); + } + } + + void postCallKernelCleanup() override { + if (offload_ == gpuOffloadType::unified) { + if (B_) { sycl::free(B_, gpuQueue_); B_ = nullptr; } + if (C_) { sycl::free(C_, gpuQueue_); C_ = 
nullptr; } + if (A_vals_) { sycl::free(A_vals_, gpuQueue_); A_vals_ = nullptr; } + if (A_cols_) { sycl::free(A_cols_, gpuQueue_); A_cols_ = nullptr; } + if (A_rows_) { sycl::free(A_rows_, gpuQueue_); A_rows_ = nullptr; } + + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } else { + if (B_) { sycl::free(B_, gpuQueue_); B_ = nullptr; } + if (C_) { sycl::free(C_, gpuQueue_); C_ = nullptr; } + if (A_vals_) { sycl::free(A_vals_, gpuQueue_); A_vals_ = nullptr; } + if (A_cols_) { sycl::free(A_cols_, gpuQueue_); A_cols_ = nullptr; } + if (A_rows_) { sycl::free(A_rows_, gpuQueue_); A_rows_ = nullptr; } + + if (A_vals_device_) { sycl::free(A_vals_device_, gpuQueue_); A_vals_device_ = nullptr; } + if (A_cols_device_) { sycl::free(A_cols_device_, gpuQueue_); A_cols_device_ = nullptr; } + if (A_rows_device_) { sycl::free(A_rows_device_, gpuQueue_); A_rows_device_ = nullptr; } + if (B_device_) { sycl::free(B_device_, gpuQueue_); B_device_ = nullptr; } + if (C_device_) { sycl::free(C_device_, gpuQueue_); C_device_ = nullptr; } + } + } + + bool firstRun_ = true; + + /** The GPU Device. */ + sycl::device myGpu_; + + /** The SYCL execution queue*/ + sycl::queue gpuQueue_; + + oneapi::mkl::layout layout_; + oneapi::mkl::transpose operationA_; + oneapi::mkl::transpose operationB_; + oneapi::mkl::index_base index_; + + T* A_vals_store_; + int64_t* A_cols_store_; + int64_t* A_rows_store_; + + T* A_vals_; + int64_t* A_cols_; + int64_t* A_rows_; + + oneapi::mkl::sparse::matrix_handle_t A_device_; + + T* A_vals_device_; + int64_t* A_cols_device_; + int64_t* A_rows_device_; + T* B_device_; + T* C_device_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/oneMKL/GPU/spmdnv.hh b/oneMKL/GPU/spmdnv.hh new file mode 100644 index 0000000..3457aa8 --- /dev/null +++ b/oneMKL/GPU/spmdnv.hh @@ -0,0 +1,409 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include "../../include/kernels/GPU/spmdnv.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmdnv_gpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + using spmdnv::type_; + + + void initialise(gpuOffloadType offload, int m, int n, double sparsity, + matrixType type) + override { + try { + myGpu_ = sycl::device(sycl::gpu_selector_v); + } catch (const std::exception& e) { + std::cerr << "ERROR - No GPU device found: " << e.what() << '\n'; + std::terminate(); + } + auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cerr << "Caught asynchronous SYCL exception during sparse::gemv:\n" << e.what() << std::endl; + } + } + }; + + gpuQueue_ = sycl::queue(myGpu_, exception_handler); + context_ = gpuQueue_.get_context(); + + x_ = nullptr; + y_ = nullptr; + + offload_ = offload; + sparsity_ = sparsity; + type_ = type; + m_ = m; + n_ = n; + + index_ = oneapi::mkl::index_base::zero; + operation_ = oneapi::mkl::transpose::nontrans; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + if (offload_ == gpuOffloadType::unified) { + x_ = sycl::malloc_shared(n_, gpuQueue_); + y_ = sycl::malloc_shared(m_, gpuQueue_); + if (!x_ || !y_) { + std::cerr << "ERROR - Failed to allocate memory for GPU SpMDnV" << std::endl; + exit(1); + } + } else { + 
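+      // Explicit-offload paths ("always"/"once"): x_ and y_ live in host USM and
+      // are mirrored by separate device buffers that are filled via queue::memcpy
+      // before the kernel runs.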
x_ = sycl::malloc_host(n_, gpuQueue_); + y_ = sycl::malloc_host(m_, gpuQueue_); + x_device_ = sycl::malloc_device(n_, gpuQueue_); + y_device_ = sycl::malloc_device(m_, gpuQueue_); + if (!x_ || !y_) { + std::cerr << "ERROR - Failed to allocate host memory" << std::endl; + exit(1); + } + } + gpuQueue_.wait_and_throw(); + + initInputMatrixVector(); + gpuQueue_.wait_and_throw(); + } + + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + A_vals_store_ = (T*)malloc(nnz_ * sizeof(T)); + A_cols_store_ = (int64_t*)malloc(nnz_ * sizeof(int64_t)); + A_rows_store_ = (int64_t*)malloc((m_ + 1) * sizeof(int64_t)); + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, n_, nnz_); + } else { + std::cerr << "Matrix type not supported" << std::endl; + exit(1); + } + } + + if (offload_ == gpuOffloadType::unified) { + A_vals_ = sycl::malloc_shared(nnz_, gpuQueue_); + A_cols_ = sycl::malloc_shared(nnz_, gpuQueue_); + A_rows_ = sycl::malloc_shared(m_ + 1, gpuQueue_); + } else { + A_vals_ = sycl::malloc_host(nnz_, gpuQueue_); + A_cols_ = sycl::malloc_host(nnz_, gpuQueue_); + A_rows_ = sycl::malloc_host(m_ + 1, gpuQueue_); + A_vals_device_ = (T*)sycl::malloc_device(nnz_ * sizeof(T), gpuQueue_); + A_cols_device_ = (int64_t*)sycl::malloc_device(nnz_ * sizeof(int64_t), gpuQueue_); + A_rows_device_ = (int64_t*)sycl::malloc_device((m_ + 1) * sizeof(int64_t), gpuQueue_); + } + + memcpy(A_rows_, A_rows_store_, static_cast(m_ + 1) * sizeof(int64_t)); + memcpy(A_cols_, A_cols_store_, static_cast(nnz_) * sizeof(int64_t)); + memcpy(A_vals_, A_vals_store_, static_cast(nnz_) * sizeof(T)); + } + +private: + void preLoopRequirements() override { + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(x_device_, x_, sizeof(T) * n_); + gpuQueue_.wait_and_throw(); + } + } + + void callSpMDnV() override { + switch (offload_) { + case gpuOffloadType::always: { + gpuQueue_.memcpy(A_vals_device_, A_vals_, sizeof(T) * nnz_); + gpuQueue_.memcpy(A_cols_device_, A_cols_, sizeof(int64_t) * nnz_); + gpuQueue_.memcpy(A_rows_device_, A_rows_, sizeof(int64_t) * (m_ + 1)); + gpuQueue_.memcpy(x_device_, x_, sizeof(T) * n_); + gpuQueue_.wait_and_throw(); + // Do computation + try { + oneapi::mkl::sparse::init_matrix_handle(&handle_); + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_device_, + beta, + y_device_, + {optimise}); + + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SPGEMV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch 
(std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + exit(1); + } + gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); + break; + } + case gpuOffloadType::once: { + try { + oneapi::mkl::sparse::init_matrix_handle(&handle_); + if (handle_ == nullptr) { + std::cerr << "ERROR - Failed to initialise matrix handle" << std::endl; + exit(1); + } + gpuQueue_.wait_and_throw(); + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {}); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_device_, + beta, + y_device_, + {optimise}); + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + + handle_ = nullptr; // Reset handle to avoid double free + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SpMDnV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch (std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + exit(1); + } + break; + } + case gpuOffloadType::unified: { + try { + std::vector int_ptr_vec; + int_ptr_vec.push_back(A_cols_); + int_ptr_vec.push_back(A_rows_); + std::vector float_ptr_vec; + float_ptr_vec.push_back(A_vals_); + float_ptr_vec.push_back(x_); + float_ptr_vec.push_back(y_); + + + handle_ = nullptr; + oneapi::mkl::sparse::init_matrix_handle(&handle_); + if (handle_ == nullptr) { + std::cerr << "ERROR - Failed to initialise matrix handle" << std::endl; + exit(1); + } + gpuQueue_.wait_and_throw(); + + auto set = oneapi::mkl::sparse::set_csr_data(gpuQueue_, + handle_, + m_, + n_, + index_, + A_rows_, + A_cols_, + A_vals_, + {}); + + auto optimise = oneapi::mkl::sparse::optimize_gemv(gpuQueue_, + operation_, + handle_, + {set}); + + auto gemv = oneapi::mkl::sparse::gemv(gpuQueue_, + operation_, + alpha, + handle_, + x_, + beta, + y_, + {optimise}); + + auto release = oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_, {gemv}); + release.wait_and_throw(); + + handle_ = nullptr; // Reset handle to avoid double free + } catch (sycl::exception const& e) { + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_); + std::cerr << "ERROR - Caught synchronous SYCL exception during SpMDnV (Once):\n" << e.what() << std::endl << "OpenCL status: " << e.code().value() << std::endl; + } catch (std::exception const &e) { + std::cerr << "\t\tCaught std exception:\n" << e.what() << std::endl; + gpuQueue_.wait(); + oneapi::mkl::sparse::release_matrix_handle(gpuQueue_, &handle_).wait(); + exit(1); + } + break; + } + } + } + + void postLoopRequirements() override { + if (offload_ == gpuOffloadType::once) { + gpuQueue_.memcpy(y_, y_device_, sizeof(T) * m_); + gpuQueue_.wait_and_throw(); + } + } + + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: + case gpuOffloadType::once: { + if (A_vals_ != nullptr) { + sycl::free(A_vals_, context_); + A_vals_ = nullptr; + } + if (A_cols_ != nullptr) { + 
sycl::free(A_cols_, context_); + A_cols_ = nullptr; + } + if (A_rows_ != nullptr) { + sycl::free(A_rows_, context_); + A_rows_ = nullptr; + } + if (A_vals_device_ != nullptr) { + sycl::free(A_vals_device_, context_); + A_vals_device_ = nullptr; + } + if (A_cols_device_ != nullptr) { + sycl::free(A_cols_device_, context_); + A_cols_device_ = nullptr; + } + if (A_rows_device_ != nullptr) { + sycl::free(A_rows_device_, context_); + A_rows_device_ = nullptr; + } + if (x_ != nullptr) { + sycl::free(x_, context_); + x_ = nullptr; + } + if (y_ != nullptr) { + sycl::free(y_, context_); + y_ = nullptr; + } + if (x_device_ != nullptr) { + sycl::free(x_device_, context_); + x_device_ = nullptr; + } + if (y_device_ != nullptr) { + sycl::free(y_device_, context_); + y_device_ = nullptr; + } + } + case gpuOffloadType::unified: { + if (A_vals_ != nullptr) { + sycl::free(A_vals_, context_); + A_vals_ = nullptr; + } + if (A_cols_ != nullptr) { + sycl::free(A_cols_, context_); + A_cols_ = nullptr; + } + if (A_rows_ != nullptr) { + sycl::free(A_rows_, context_); + A_rows_ = nullptr; + } + if (x_ != nullptr) { + sycl::free(x_, context_); + x_ = nullptr; + } + if (y_ != nullptr) { + sycl::free(y_, context_); + y_ = nullptr; + } + break; + } + } + if (offload_ == gpuOffloadType::unified) { + free(A_vals_store_); + free(A_cols_store_); + free(A_rows_store_); + } + gpuQueue_.wait_and_throw(); + } + + /** Whether the initialise function has been called before. */ + bool alreadyInitialised_ = false; + + /** The GPU Device. */ + sycl::device myGpu_; + + /** The SYCL execution queue*/ + sycl::queue gpuQueue_; + + sycl::context context_; + + oneapi::mkl::index_base index_; + oneapi::mkl::transpose operation_; + + T* A_vals_store_ = nullptr; + int64_t* A_cols_store_ = nullptr; + int64_t* A_rows_store_ = nullptr; + + T* A_vals_ = nullptr; + int64_t* A_cols_ = nullptr; + int64_t* A_rows_ = nullptr; + + oneapi::mkl::sparse::matrix_handle_t handle_ = nullptr; + + T* A_vals_device_ = nullptr; + int64_t* A_cols_device_ = nullptr; + int64_t* A_rows_device_ = nullptr; + T* x_device_ = nullptr; + T* y_device_ = nullptr; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/oneMKL/GPU/spmspm.hh b/oneMKL/GPU/spmspm.hh new file mode 100644 index 0000000..1bdec86 --- /dev/null +++ b/oneMKL/GPU/spmspm.hh @@ -0,0 +1,870 @@ +#pragma once + +#ifdef GPU_ONEMKL + +#include +#include "../../include/kernels/GPU/spmspm.hh" +#include "../../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmspm_gpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::C_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::C_rows_; + using spmspm::C_cols_; + using spmspm::C_vals_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::type_; + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, matrixType type, + bool binary = false) override { + firstRun_ = true; + if (!initialised_) { + // Set up the sycl parameters + device_ = sycl::device(sycl::gpu_selector_v); + queue_ = sycl::queue(device_, exception_handler); + context_ = queue_.get_context(); + auto dev = queue_.get_device(); + initialised_ = true; + } + + // Storing initialise parameters into global variables + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + type_ = type; + offload_ = offload; + + // Calculating starting matrix NNZ values + A_nnz_ = 1 + 
(uint64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (uint64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + switch (offload_) { + case gpuOffloadType::always: { + A_rows_store_ = (int64_t*)malloc(static_cast(m_ + 1) * sizeof(int64_t)); + A_cols_store_ = (int64_t*)malloc(static_cast(A_nnz_) * sizeof(int64_t)); + A_vals_store_ = (T*)malloc(static_cast(A_nnz_) * sizeof(T)); + B_rows_store_ = (int64_t*)malloc(static_cast(k_ + 1) * sizeof(int64_t)); + B_cols_store_ = (int64_t*)malloc(static_cast(B_nnz_) * sizeof(int64_t)); + B_vals_store_ = (T*)malloc(static_cast(B_nnz_) * sizeof(T)); + + A_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + A_cols_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + A_vals_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_host(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_rows_device_ = sycl::malloc_device(static_cast(k_ + 1), queue_); + B_cols_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + B_vals_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + C_rows_device_ = nullptr; + C_cols_device_ = nullptr; + C_vals_device_ = nullptr; + break; + } + case gpuOffloadType::once: { + A_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_host(static_cast(A_nnz_), queue_); + A_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + A_cols_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + A_vals_device_ = sycl::malloc_device(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_host(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_host(static_cast(B_nnz_), queue_); + B_rows_device_ = sycl::malloc_device(static_cast(k_ + 1), queue_); + B_cols_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + B_vals_device_ = sycl::malloc_device(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + C_rows_device_ = nullptr; + C_cols_device_ = nullptr; + C_vals_device_ = nullptr; + break; + } + case gpuOffloadType::unified: { + A_rows_ = sycl::malloc_shared(static_cast(m_ + 1), queue_); + A_cols_ = sycl::malloc_shared(static_cast(A_nnz_), queue_); + A_vals_ = sycl::malloc_shared(static_cast(A_nnz_), queue_); + + B_rows_ = sycl::malloc_shared(static_cast(k_ + 1), queue_); + B_cols_ = sycl::malloc_shared(static_cast(B_nnz_), queue_); + B_vals_ = sycl::malloc_shared(static_cast(B_nnz_), queue_); + + C_rows_ = nullptr; + C_cols_ = nullptr; + C_vals_ = nullptr; + break; + } + } + queue_.wait_and_throw(); + initInputMatrices(); + } + +protected: + void toSparseFormat() override { + if (offload_ == gpuOffloadType::always) { + int seedOffset = 0; + do { + if (type_ == matrixType::rmat) { + rMatCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + rMatCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::random) { + randomCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + 
randomCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else if (type_ == matrixType::finiteElements) { + finiteElementCSR(A_vals_store_, A_cols_store_, A_rows_store_, m_, k_, A_nnz_, SEED + seedOffset++); + finiteElementCSR(B_vals_store_, B_cols_store_, B_rows_store_, k_, n_, B_nnz_, SEED + seedOffset++); + } else { + std::cerr << "Unknown matrix type" << std::endl; + exit(1); + } + } while (calcCNNZ(m_, A_nnz_, A_rows_store_, A_cols_store_, k_, B_nnz_, B_rows_store_, B_cols_store_) == 0); + } + + memcpy(A_rows_, A_rows_store_, static_cast(m_ + 1) * sizeof(int64_t)); + memcpy(A_cols_, A_cols_store_, static_cast(A_nnz_) * sizeof(int64_t)); + memcpy(A_vals_, A_vals_store_, static_cast(A_nnz_) * sizeof(T)); + memcpy(B_rows_, B_rows_store_, static_cast(k_ + 1) * sizeof(int64_t)); + memcpy(B_cols_, B_cols_store_, static_cast(B_nnz_) * sizeof(int64_t)); + memcpy(B_vals_, B_vals_store_, static_cast(B_nnz_) * sizeof(T)); + } + +private: + void preLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + // Nothing to do, does it all in the callSpmspm loop + break; + } + case gpuOffloadType::once: { + auto ARows = queue_.copy(A_rows_, A_rows_device_, static_cast(m_ + 1)); + auto ACols = queue_.copy(A_cols_, A_cols_device_, static_cast(A_nnz_)); + auto AVals = queue_.copy(A_vals_, A_vals_device_, static_cast(A_nnz_)); + + auto BRows = queue_.copy(B_rows_, B_rows_device_, static_cast(k_ + 1)); + auto BCols = queue_.copy(B_cols_, B_cols_device_, static_cast(B_nnz_)); + auto BVals = queue_.copy(B_vals_, B_vals_device_, static_cast(B_nnz_)); + + ARows.wait(); + ACols.wait(); + AVals.wait(); + BRows.wait(); + BCols.wait(); + BVals.wait(); + break; + } + case gpuOffloadType::unified: { + // Nothing to do here as shared memory + break; + } + } + } + + void callSpmspm() override { + switch (offload_) { + case gpuOffloadType::always: { + if (!firstRun_) { + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + } + + auto ARows = queue_.copy(A_rows_, A_rows_device_, static_cast(m_ + 1)); + auto ACols = queue_.copy(A_cols_, A_cols_device_, static_cast(A_nnz_)); + auto AVals = queue_.copy(A_vals_, A_vals_device_, static_cast(A_nnz_)); + + auto BRows = queue_.copy(B_rows_, B_rows_device_, static_cast(k_ + 1)); + auto BCols = queue_.copy(B_cols_, B_cols_device_, static_cast(B_nnz_)); + auto BVals = queue_.copy(B_vals_, B_vals_device_, static_cast(B_nnz_)); + + C_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + + try { + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {ARows, ACols, AVals}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + {BRows, BCols, BVals}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + + auto ev1_1 = 
oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + + cNnzBuffer = sycl::malloc_host(1, queue_); + + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait_and_throw(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + C_vals_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + C_cols_device_, + C_vals_device_, + {ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + C_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + C_cols_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + C_vals_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + auto CRows = queue_.copy(C_rows_device_, C_rows_, static_cast(m_ + 1)); + auto CCols = queue_.copy(C_cols_device_, C_cols_, static_cast(C_nnz_)); + auto CVals = queue_.copy(C_vals_device_, C_vals_, static_cast(C_nnz_)); + CRows.wait(); + CCols.wait(); + CVals.wait(); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + } catch (sycl::exception const &e) { + std::cerr << "\t\tCaught synchronous SYCL exception:\n" << e.what() << std::endl; + queue_.wait(); + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + } + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + break; + } + case gpuOffloadType::once: { + // If already allocated, free the device C arrays + if (!firstRun_) { + 
sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + } + + C_rows_device_ = sycl::malloc_device(static_cast(m_ + 1), queue_); + + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + {}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + {}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + auto ev1_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + if (!tempBuffer) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer2) throw std::runtime_error("Could not allocate memory"); + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + if (!tempBuffer2) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + cNnzBuffer = sycl::malloc_host(1, queue_); + if (!cNnzBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + C_vals_device_ = sycl::malloc_device(static_cast(C_nnz_), queue_); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_device_, + C_cols_device_, + C_vals_device_, + {ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + 
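// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// The always/once/unified cases in callSpmspm() all drive
// oneapi::mkl::sparse::matmat through the same six-stage request sequence;
// only where the CSR arrays live (host copies, explicit device buffers, or
// shared USM) differs. The sketch below condenses that sequence for matrix
// handles and a matmat descriptor that have already been populated via
// set_csr_data / set_matmat_data, mirroring the call signatures used in this
// file. spgemmStagesSketch and its parameter names are invented for
// illustration; C_rows must already point at m + 1 device entries.
template <typename T>
void spgemmStagesSketch(sycl::queue& q,
                        oneapi::mkl::sparse::matrix_handle_t A,
                        oneapi::mkl::sparse::matrix_handle_t B,
                        oneapi::mkl::sparse::matrix_handle_t C,
                        oneapi::mkl::sparse::matmat_descr_t descr,
                        int64_t m, int64_t n, oneapi::mkl::index_base base,
                        int64_t* C_rows, int64_t*& C_cols, T*& C_vals,
                        int64_t& C_nnz) {
  using oneapi::mkl::sparse::matmat;
  using oneapi::mkl::sparse::matmat_request;

  // 1) Query the scratch size for work estimation, then run that stage.
  int64_t* estSize = sycl::malloc_host<int64_t>(1, q);
  auto e1 = matmat(q, A, B, C, matmat_request::get_work_estimation_buf_size,
                   descr, estSize, nullptr, {});
  e1.wait();
  void* estBuf = sycl::malloc_device(static_cast<size_t>(estSize[0]), q);
  auto e2 = matmat(q, A, B, C, matmat_request::work_estimation,
                   descr, estSize, estBuf, {e1});

  // 2) Same pattern for the compute stage.
  int64_t* cmpSize = sycl::malloc_host<int64_t>(1, q);
  auto e3 = matmat(q, A, B, C, matmat_request::get_compute_buf_size,
                   descr, cmpSize, nullptr, {e2});
  e3.wait();
  void* cmpBuf = sycl::malloc_device(static_cast<size_t>(cmpSize[0]), q);
  auto e4 = matmat(q, A, B, C, matmat_request::compute,
                   descr, cmpSize, cmpBuf, {e3});

  // 3) Read back nnz(C), size C's column/value arrays, finalize and sort.
  int64_t* nnzBuf = sycl::malloc_host<int64_t>(1, q);
  auto e5 = matmat(q, A, B, C, matmat_request::get_nnz,
                   descr, nnzBuf, nullptr, {e4});
  e5.wait();
  C_nnz = nnzBuf[0];
  C_cols = sycl::malloc_device<int64_t>(static_cast<size_t>(C_nnz), q);
  C_vals = sycl::malloc_device<T>(static_cast<size_t>(C_nnz), q);
  auto e6 = oneapi::mkl::sparse::set_csr_data(q, C, m, n, base,
                                              C_rows, C_cols, C_vals, {e5});
  auto e7 = matmat(q, A, B, C, matmat_request::finalize,
                   descr, nullptr, nullptr, {e6});
  oneapi::mkl::sparse::sort_matrix(q, C, {e7}).wait();

  sycl::free(estSize, q);
  sycl::free(estBuf, q);
  sycl::free(cmpSize, q);
  sycl::free(cmpBuf, q);
  sycl::free(nnzBuf, q);
}
// -----------------------------------------------------------------------------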
oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + break; + } + case gpuOffloadType::unified: { + // If already allocated, free the device C arrays + if (!firstRun_) { + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + } + + C_rows_ = sycl::malloc_shared(static_cast(m_ + 1), queue_); + + oneapi::mkl::sparse::init_matrix_handle(&A_handle_); + oneapi::mkl::sparse::init_matrix_handle(&B_handle_); + oneapi::mkl::sparse::init_matrix_handle(&C_handle_); + + auto setA = oneapi::mkl::sparse::set_csr_data(queue_, + A_handle_, + m_, + k_, + AIndex_, + A_rows_, + A_cols_, + A_vals_, + {}); + auto setB = oneapi::mkl::sparse::set_csr_data(queue_, + B_handle_, + k_, + n_, + BIndex_, + B_rows_, + B_cols_, + B_vals_, + {}); + auto setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_, + (int64_t*)nullptr, + (T*)nullptr, + {}); + + oneapi::mkl::sparse::init_matmat_descr(&description_); + + oneapi::mkl::sparse::set_matmat_data(description_, + viewA_, + opA_, + viewB_, + opB_, + viewC_); + + request_ = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size; + sizeTempBuffer = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev1_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + nullptr, + {setA, setB, setC}); + ev1_1.wait(); + + tempBuffer = sycl::malloc_device(sizeTempBuffer[0], queue_); + if (!tempBuffer) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::work_estimation; + auto ev1_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer, + tempBuffer, + {ev1_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_compute_buf_size; + sizeTempBuffer2 = sycl::malloc_host(1, queue_); + if (!sizeTempBuffer2) throw std::runtime_error("Could not allocate memory"); + auto ev2_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + nullptr, + {ev1_3}); + ev2_1.wait(); + + tempBuffer2 = sycl::malloc_device(sizeTempBuffer2[0], queue_); + if (!tempBuffer2) throw std::runtime_error("Could not allocate memory"); + + request_ = oneapi::mkl::sparse::matmat_request::compute; + auto ev2_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + sizeTempBuffer2, + tempBuffer2, + {ev2_1}); + + request_ = oneapi::mkl::sparse::matmat_request::get_nnz; + cNnzBuffer = sycl::malloc_shared(1, queue_); + if (!cNnzBuffer) throw std::runtime_error("Could not allocate memory"); + auto ev3_1 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + cNnzBuffer, + nullptr, + {ev2_3}); + ev3_1.wait(); + + C_nnz_ = cNnzBuffer[0]; + C_cols_ = sycl::malloc_shared(static_cast(C_nnz_), queue_); + C_vals_ = sycl::malloc_shared(static_cast(C_nnz_), queue_); + if (!C_vals_) throw std::runtime_error("Could not allocate memory"); + + setC = oneapi::mkl::sparse::set_csr_data(queue_, + C_handle_, + m_, + n_, + CIndex_, + C_rows_, + C_cols_, + C_vals_, + 
{ev3_1}); + + request_ = oneapi::mkl::sparse::matmat_request::finalize; + auto ev3_3 = oneapi::mkl::sparse::matmat(queue_, + A_handle_, + B_handle_, + C_handle_, + request_, + description_, + nullptr, + nullptr, + {setC}); + + auto ev_sort = oneapi::mkl::sparse::sort_matrix(queue_, C_handle_, {ev3_3}); + + oneapi::mkl::sparse::release_matmat_descr(&description_); + oneapi::mkl::sparse::release_matrix_handle(queue_, &A_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &B_handle_).wait(); + oneapi::mkl::sparse::release_matrix_handle(queue_, &C_handle_).wait(); + sycl::free(sizeTempBuffer, queue_); + sycl::free(sizeTempBuffer2, queue_); + sycl::free(tempBuffer, queue_); + sycl::free(tempBuffer2, queue_); + sycl::free(cNnzBuffer, queue_); + break; + } + } + firstRun_ = false; + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + C_rows_ = sycl::malloc_host(static_cast(m_ + 1), queue_); + + C_cols_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + C_vals_ = sycl::malloc_host(static_cast(C_nnz_), queue_); + + auto CRows = queue_.copy(C_rows_device_, C_rows_, static_cast(m_ + 1)); + auto CCols = queue_.copy(C_cols_device_, C_cols_, static_cast(C_nnz_)); + auto CVals = queue_.copy(C_vals_device_, C_vals_, static_cast(C_nnz_)); + CRows.wait(); + CCols.wait(); + CVals.wait(); + + sycl::free(C_rows_device_, queue_); + sycl::free(C_cols_device_, queue_); + sycl::free(C_vals_device_, queue_); + break; + } + case gpuOffloadType::unified: { + break; + } + } + } + + void postCallKernelCleanup() override { + switch (offload_) { + case gpuOffloadType::always: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(A_rows_device_, queue_); + sycl::free(A_cols_device_, queue_); + sycl::free(A_vals_device_, queue_); + + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(B_rows_device_, queue_); + sycl::free(B_cols_device_, queue_); + sycl::free(B_vals_device_, queue_); + + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + break; + } + case gpuOffloadType::once: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(A_rows_device_, queue_); + sycl::free(A_cols_device_, queue_); + sycl::free(A_vals_device_, queue_); + + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(B_rows_device_, queue_); + sycl::free(B_cols_device_, queue_); + sycl::free(B_vals_device_, queue_); + + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + break; + } + case gpuOffloadType::unified: { + sycl::free(A_rows_, queue_); + sycl::free(A_cols_, queue_); + sycl::free(A_vals_, queue_); + sycl::free(B_rows_, queue_); + sycl::free(B_cols_, queue_); + sycl::free(B_vals_, queue_); + sycl::free(C_rows_, queue_); + sycl::free(C_cols_, queue_); + sycl::free(C_vals_, queue_); + + free(A_rows_store_); + free(A_cols_store_); + free(A_vals_store_); + free(B_rows_store_); + free(B_cols_store_); + free(B_vals_store_); + break; + } + } + } + + // First-run check to confirm whether to clean up old arrays or not + bool firstRun_ = true; + + bool initialised_ = false; + + // Sycl parameters + sycl::queue queue_; + sycl::device device_; + sycl::context context_; + + // oneMKL parameters + oneapi::mkl::transpose opA_ = oneapi::mkl::transpose::nontrans; 
+ oneapi::mkl::transpose opB_ = oneapi::mkl::transpose::nontrans; + + oneapi::mkl::sparse::matrix_view_descr viewA_ = oneapi::mkl::sparse::matrix_view_descr::general; + oneapi::mkl::sparse::matrix_view_descr viewB_ = oneapi::mkl::sparse::matrix_view_descr::general; + oneapi::mkl::sparse::matrix_view_descr viewC_ = oneapi::mkl::sparse::matrix_view_descr::general; + + oneapi::mkl::index_base AIndex_ = oneapi::mkl::index_base::zero; + oneapi::mkl::index_base BIndex_ = oneapi::mkl::index_base::zero; + oneapi::mkl::index_base CIndex_ = oneapi::mkl::index_base::zero; + + oneapi::mkl::sparse::matrix_handle_t A_handle_ = nullptr; + oneapi::mkl::sparse::matrix_handle_t B_handle_ = nullptr; + oneapi::mkl::sparse::matrix_handle_t C_handle_ = nullptr; + + oneapi::mkl::sparse::matmat_descr_t description_ = nullptr; + oneapi::mkl::sparse::matmat_request request_; + + size_t alloc_sz = 0; + + // A CSR arrays + int64_t* A_rows_store_ = nullptr; + int64_t* A_cols_store_ = nullptr; + T* A_vals_store_ = nullptr; + // LOCAL + int64_t* A_rows_ = nullptr; + int64_t* A_cols_ = nullptr; + T* A_vals_ = nullptr; + // DEVICE + int64_t* A_rows_device_ = nullptr; + int64_t* A_cols_device_ = nullptr; + T* A_vals_device_ = nullptr; + + // B CSR arrays + int64_t* B_rows_store_ = nullptr; + int64_t* B_cols_store_ = nullptr; + T* B_vals_store_ = nullptr; + // LOCAL + int64_t* B_rows_ = nullptr; + int64_t* B_cols_ = nullptr; + T* B_vals_ = nullptr; + // DEVICE + int64_t* B_rows_device_ = nullptr; + int64_t* B_cols_device_ = nullptr; + T* B_vals_device_ = nullptr; + + // C CSR arrays + // LOCAL -- carried through from parent class -- needed externally for checksum + // DEVICE + int64_t* C_rows_device_ = nullptr; + int64_t* C_cols_device_ = nullptr; + T* C_vals_device_ = nullptr; + + // Temporary buffers + int64_t* sizeTempBuffer = nullptr; + int64_t* sizeTempBuffer2 = nullptr; + int64_t* cNnzBuffer = nullptr; + void* tempBuffer = nullptr; + void* tempBuffer2 = nullptr; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/common.hh b/rocBLAS/common.hh index 01ea03a..78ef4a7 100644 --- a/rocBLAS/common.hh +++ b/rocBLAS/common.hh @@ -8,8 +8,10 @@ if (hipError_t e = (f); e != hipSuccess) { \ std::cout << "HIP error: " << __FILE__ << ":" << __LINE__ << ": " \ << hipGetErrorString(e) << std::endl; \ + std::cout << "[DEBUG] -- " << #f << std::endl; \ exit(1); \ } \ } while (false) -#endif \ No newline at end of file + +#endif \ No newline at end of file diff --git a/rocBLAS/gemv.hh b/rocBLAS/gemv.hh index e1e7a02..1a79b8e 100644 --- a/rocBLAS/gemv.hh +++ b/rocBLAS/gemv.hh @@ -43,10 +43,28 @@ class gemv_gpu : public gemv { * - Unified: Initialise data as unified memory; no data movement semantics * required */ void initialise(gpuOffloadType offload, int m, int n) override { + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (!alreadyInitialised_) { alreadyInitialised_ = true; // Perform set-up which doesn't need to happen every problem size change. 
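// --- Illustrative sketch (hypothetical macro, not part of the diff above) ---
// The rocBLAS/common.hh hunk above extends hipCheckError so that, on failure,
// it also prints the stringified call (#f) that produced the error. A minimal,
// self-contained version of that pattern, under the invented name
// HIP_CHECK_SKETCH, looks like this:
#include <cstdlib>
#include <iostream>
#include <hip/hip_runtime.h>

#define HIP_CHECK_SKETCH(call)                                           \
  do {                                                                   \
    hipError_t err_ = (call);                                            \
    if (err_ != hipSuccess) {                                            \
      std::cerr << __FILE__ << ":" << __LINE__ << " HIP error: "         \
                << hipGetErrorString(err_) << " in `" << #call << "`"    \
                << std::endl;                                            \
      std::exit(1);                                                      \
    }                                                                    \
  } while (false)

// Usage: wrap every HIP runtime call so a failing allocation or copy reports
// exactly which call failed before the benchmark aborts, e.g.
//   HIP_CHECK_SKETCH(hipMalloc(&ptr, bytes));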
// Create a handle for rocBLAS + if (print_) std::cout << "Creating handle" << std::endl; rocblas_status status = rocblas_create_handle(&handle_); if (status != rocblas_status_success) { std::cout << "Failed to make rocBLAS handle: " << status << std::endl; @@ -54,7 +72,12 @@ class gemv_gpu : public gemv { } // Get device identifier + int count; + hipCheckError(hipGetDeviceCount(&count)); + if (print_) std::cout << "Number of devices: " << count << std::endl; + if (print_) std::cout << "Getting device ID" << std::endl; hipCheckError(hipGetDevice(&gpuDevice_)); + if (print_) std::cout << "Device ID: " << gpuDevice_ << std::endl; // Initialise 3 streams to asynchronously move data between host and // device @@ -63,6 +86,7 @@ class gemv_gpu : public gemv { hipCheckError(hipStreamCreate(&s3_)); // Enable passing alpha parameter from pointer to host memory + if (print_) std::cout << "Setting pointer mode to host" << std::endl; status = rocblas_set_pointer_mode(handle_, rocblas_pointer_mode_host); if (status != rocblas_status_success) { std::cout << "Failed to set rocBLAS pointer mode: " << status @@ -76,15 +100,18 @@ class gemv_gpu : public gemv { n_ = n; if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tAllocating unified memory" << std::endl; hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * n_)); hipCheckError(hipMallocManaged(&x_, sizeof(T) * n_)); hipCheckError(hipMallocManaged(&y_, sizeof(T) * m_)); } else { // Allocate matrices on host + if (print_) std::cout << "\tAllocating host memory" << std::endl; hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * n_)); hipCheckError(hipHostMalloc((void**)&x_, sizeof(T) * n_)); hipCheckError(hipHostMalloc((void**)&y_, sizeof(T) * m_)); // Allocate matrices on device + if (print_) std::cout << "\tAllocating device memory" << std::endl; hipCheckError(hipMalloc((void**)&A_device_, sizeof(T) * m_ * n_)); hipCheckError(hipMalloc((void**)&x_device_, sizeof(T) * n_)); hipCheckError(hipMalloc((void**)&y_device_, sizeof(T) * m_)); @@ -104,6 +131,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; // Offload input data from host to the device. hipCheckError(hipMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, hipMemcpyHostToDevice, s1_)); @@ -114,9 +142,9 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { + if (print_) std::cout << "\tPrefetching data to GPU" << std::endl; // Prefetch input data to device - hipCheckError( - hipMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_, sizeof(T) * m_ * n_, gpuDevice_, s1_)); hipCheckError(hipMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); break; @@ -128,6 +156,7 @@ class gemv_gpu : public gemv { void callGemv() override { switch (offload_) { case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; // Offload input data from host to the device. 
hipCheckError(hipMemcpyAsync(A_device_, A_, sizeof(T) * m_ * n_, hipMemcpyHostToDevice, s1_)); @@ -136,6 +165,7 @@ class gemv_gpu : public gemv { hipCheckError(hipMemcpyAsync(y_device_, y_, sizeof(T) * m_, hipMemcpyHostToDevice, s3_)); // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_device_, std::max(1, m_), @@ -155,6 +185,7 @@ class gemv_gpu : public gemv { exit(1); } } + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Offload output data from device to host hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); @@ -164,6 +195,7 @@ class gemv_gpu : public gemv { } case gpuOffloadType::once: { // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_device_, std::max(1, m_), @@ -187,6 +219,7 @@ class gemv_gpu : public gemv { } case gpuOffloadType::unified: { // Call rocBLAS GEMV kernel + if (print_) std::cout << "\tCalling rocBLAS GEMV kernel" << std::endl; if constexpr (std::is_same_v) { rocblas_status stat = rocblas_sgemv( handle_, transA_, m_, n_, &alpha, A_, std::max(1, m_), x_, @@ -220,6 +253,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Offload output data from device to host hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); @@ -228,6 +262,7 @@ class gemv_gpu : public gemv { break; } case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; // Ensure all output data resides on host once work has completed hipCheckError( hipMemPrefetchAsync(y_, sizeof(T) * m_, hipCpuDeviceId, s3_)); @@ -242,20 +277,25 @@ class gemv_gpu : public gemv { * after Kernel has been called. */ void postCallKernelCleanup() override { if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified memory arrays" << std::endl; hipCheckError(hipFree(A_)); hipCheckError(hipFree(x_)); hipCheckError(hipFree(y_)); } else { // Free the memory held on host and device + if (print_) std::cout << "\tFreeing host memory arrays" << std::endl; hipCheckError(hipHostFree((void*)A_)); hipCheckError(hipHostFree((void*)x_)); hipCheckError(hipHostFree((void*)y_)); + if (print_) std::cout << "\tFreeing device memory arrays" << std::endl; hipCheckError(hipFree(A_device_)); hipCheckError(hipFree(x_device_)); hipCheckError(hipFree(y_device_)); } } + bool print_ = true; + /** Whether the initialise function has been called before. 
*/ bool alreadyInitialised_ = false; diff --git a/rocBLAS/spmdnm.hh b/rocBLAS/spmdnm.hh new file mode 100644 index 0000000..c41828f --- /dev/null +++ b/rocBLAS/spmdnm.hh @@ -0,0 +1,690 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include "../include/kernels/GPU/spmdnm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +#include + +namespace gpu { +template +class spmdnm_gpu : public spmdnm { +public: + using spmdnm::spmdnm; + using spmdnm::initInputMatrices; + using spmdnm::nnz_; + using spmdnm::m_; + using spmdnm::n_; + using spmdnm::k_; + using spmdnm::A_; + using spmdnm::B_; + using spmdnm::C_; + using spmdnm::offload_; + using spmdnm::sparsity_; + + ~spmdnm_gpu() { + if (initialised_) { + rocsparse_destroy_handle(handle_); + hipCheckError(hipStreamDestroy(s1_)); + hipCheckError(hipStreamDestroy(s2_)); + hipCheckError(hipStreamDestroy(s3_)); + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) override { + // Set up problem parameters + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (print_) std::cout << "Initialising " << m << "x" << k << " . " << k << "x" << n << std::endl; + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + offload_ = offload; + nnz_ = 1 + (int64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + // Set up rocSPARSE metadata + base_ = rocsparse_index_base_zero; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + index_ = rocsparse_indextype_i64; + order_ = rocsparse_order_column; + algorithm_ = rocsparse_spmm_alg_csr_nnz_split; // This is the only algo for this one + + if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + throw std::runtime_error("Unsupported data type for spmdnm_gpu"); + } + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + + // Get the GPU + hipCheckError(hipGetDevice(&gpuDevice_)); + // Make streams for asynchronous GPU comunication + hipCheckError(hipStreamCreate(&s1_)); + hipCheckError(hipStreamCreate(&s2_)); + hipCheckError(hipStreamCreate(&s3_)); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipMallocManaged(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipMallocManaged(&C_, sizeof(T) * m_ * n_)); + } else { + // Host data structures + hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipHostMalloc((void**)&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipHostMalloc((void**)&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipHostMalloc((void**)&B_, sizeof(T) * k_ * n_)); + 
hipCheckError(hipHostMalloc((void**)&C_, sizeof(T) * m_ * n_)); + // GPU data structures + hipCheckError(hipMalloc((void**)&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc((void**)&A_cols_device_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMalloc((void**)&A_vals_device_, sizeof(T) * nnz_)); + hipCheckError(hipMalloc((void**)&B_device_, sizeof(T) * k_ * n_)); + hipCheckError(hipMalloc((void**)&C_device_, sizeof(T) * m_ * n_)); + } + + if (print_) std::cout << "\tInitialising matrices" << std::endl; + initInputMatrices(); + } + + +protected: + void toSparseFormat() override { + int64_t nnz_encountered = 0; + + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < k_; col++) { + if (A_[(row * k_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * k_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + } + +private: + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(B_device_, + B_, + sizeof(T) * k_ * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(C_device_, + C_, + sizeof(T) * m_ * n_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(A_cols_, + sizeof(int64_t) * nnz_, + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, + sizeof(T) * nnz_, + gpuDevice_, + s1_)); + hipCheckError(hipMemPrefetchAsync(B_, + sizeof(T) * k_ * n_, + gpuDevice_, + s2_)); + hipCheckError(hipMemPrefetchAsync(C_, + sizeof(T) * m_ * n_, + gpuDevice_, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpmdnm() override { + if (print_) std::cout << "callSpmdnm" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(B_device_, + B_, + sizeof(T) * k_ * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(C_device_, + C_, + sizeof(T) * m_ * n_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, m_, k_, nnz_, A_rows_device_, + A_cols_device_, A_vals_device_, index_, index_, + base_, dataType_); + 
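// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// toSparseFormat() above walks the dense m x k matrix row by row and emits the
// three CSR arrays that rocsparse_create_csr_descr consumes: row offsets
// (m + 1 entries) plus one column index and one value per non-zero. The free
// function below is a self-contained version of that conversion; the name
// denseToCsrSketch is invented for illustration.
#include <cstdint>
#include <vector>

template <typename T>
void denseToCsrSketch(const T* dense, int64_t rows, int64_t cols,
                      std::vector<int64_t>& rowPtr,
                      std::vector<int64_t>& colInd,
                      std::vector<T>& vals) {
  rowPtr.assign(static_cast<size_t>(rows) + 1, 0);
  colInd.clear();
  vals.clear();
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      T v = dense[r * cols + c];
      if (v != T(0)) {  // keep only the non-zero entries
        colInd.push_back(c);
        vals.push_back(v);
      }
    }
    // Running prefix sum of non-zeros seen so far closes row r.
    rowPtr[static_cast<size_t>(r) + 1] = static_cast<int64_t>(vals.size());
  }
}
// Example: the 2x3 matrix [[1,0,2],[0,0,3]] yields rowPtr = {0,2,3},
// colInd = {0,2,2} and vals = {1,2,3}.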
checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, k_, n_, k_, B_device_, + dataType_, order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, m_, n_, m_, C_device_, + dataType_, order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMalloc(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer != nullptr) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_, + C_device_, + sizeof(T) * m_ * n_, + hipMemcpyDeviceToHost, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, + m_, + k_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, + k_, + n_, + k_, + B_device_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, + m_, + n_, + m_, + C_device_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with 
stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMalloc(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the GEMV + status_ = rocsparse_create_csr_descr(&A_description_, + m_, + k_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr for A"); + + status_ = rocsparse_create_dnmat_descr(&B_description_, + k_, + n_, + k_, + B_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for B"); + + status_ = rocsparse_create_dnmat_descr(&C_description_, + m_, + n_, + m_, + C_, + dataType_, + order_); + checkStatus("Failed rocsparse_create_dnmat_descr for C"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size; + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_buffer_size"); + + void* buffer = nullptr; + if (print_) std::cout << "\tAllocating buffer with buffer_size = " << buffer_size << std::endl; + if (buffer_size > 0) hipCheckError(hipMallocManaged(&buffer, buffer_size)); + + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_preprocess, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_preprocess"); + + hipCheckError(hipDeviceSynchronize()); + status_ = rocsparse_spmm(handle_, + operation_, + operation_, + &alpha, + A_description_, + B_description_, + &beta, + C_description_, + dataType_, + algorithm_, + rocsparse_spmm_stage_compute, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spmm with stage=rocsparse_spmm_stage_compute"); + + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tdestroying rocSPARSE structures" << 
std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(A_description_); + checkStatus("Failed rocsparse_destroy_spmat_descr for A"); + status_ = rocsparse_destroy_dnmat_descr(B_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for B"); + status_ = rocsparse_destroy_dnmat_descr(C_description_); + checkStatus("Failed rocsparse_destroy_dnmat_descr for C"); + if (buffer) hipCheckError(hipFree(buffer)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postLoopRequirements() override { + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_, + C_device_, + sizeof(T) * m_ * n_, + hipMemcpyDeviceToHost, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all output data resides on host once work has completed + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(C_, + sizeof(T) * m_ * n_, + hipCpuDeviceId, + s3_)); + // Ensure device has finished all work. + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel cleanup" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified arrays" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(B_)); + hipCheckError(hipFree(C_)); + } else { + if (print_) std::cout << "\tFreeing CPU arrays" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)B_)); + hipCheckError(hipHostFree((void*)C_)); + + if (print_) std::cout << "\tFreeing GPU arrays" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(B_device_)); + hipCheckError(hipFree(C_device_)); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << std::endl; + switch (status_) { + case rocsparse_status_success: { + std::cerr << "rocsparse_status_success" << std::endl; + break; + } + case rocsparse_status_invalid_handle: { + std::cerr << "rocsparse_status_invalid_handle" << std::endl; + break; + } + case rocsparse_status_not_implemented: { + std::cerr << "rocsparse_status_not_implemented" << std::endl; + break; + } + case rocsparse_status_invalid_pointer: { + std::cerr << "rocsparse_status_invalid_pointer" << std::endl; + break; + } + case rocsparse_status_invalid_size: { + std::cerr << "rocsparse_status_invalid_size" << std::endl; + break; + } + case rocsparse_status_memory_error: { + std::cerr << "rocsparse_status_memory_error" << std::endl; + break; + } + case rocsparse_status_internal_error: { + std::cerr << "rocsparse_status_internal_error" << std::endl; + break; + } + case rocsparse_status_invalid_value: { + std::cerr << "rocsparse_status_invalid_value" << std::endl; + break; + } + case rocsparse_status_arch_mismatch: { + std::cerr << "rocsparse_status_arch_mismatch" << std::endl; + break; + } + case rocsparse_status_zero_pivot: { + std::cerr << "rocsparse_status_zero_pivot" << 
std::endl; + break; + } + case rocsparse_status_not_initialized: { + std::cerr << "rocsparse_status_not_initialized" << std::endl; + break; + } + case rocsparse_status_type_mismatch: { + std::cerr << "rocsparse_status_type_mismatch" << std::endl; + break; + } + case rocsparse_status_requires_sorted_storage: { + std::cerr << "rocsparse_status_requires_sorted_storage" << std::endl; + break; + } + case rocsparse_status_thrown_exception: { + std::cerr << "rocsparse_status_thrown_exception" << std::endl; + break; + } + default: { + std::cerr << "Unknown status code: " << status_ << std::endl; + } + } + exit(1); + } + } + + bool initialised_ = false; + bool print_ = false; + + rocsparse_status status_; + rocsparse_operation operation_; + rocsparse_handle handle_; + rocsparse_index_base base_; + rocsparse_datatype dataType_; + rocsparse_matrix_type type_; + rocsparse_indextype index_; + rocsparse_spmm_alg algorithm_; + rocsparse_order order_; + + rocsparse_spmat_descr A_description_; + rocsparse_dnmat_descr B_description_; + rocsparse_dnmat_descr C_description_; + + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + + int64_t* A_rows_device_; + int64_t* A_cols_device_; + T* A_vals_device_; + T* B_device_; + T* C_device_; + + int gpuDevice_; + hipStream_t s1_, s2_, s3_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/spmdnv.hh b/rocBLAS/spmdnv.hh new file mode 100644 index 0000000..a89932c --- /dev/null +++ b/rocBLAS/spmdnv.hh @@ -0,0 +1,600 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include "../include/kernels/GPU/spmdnv.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmdnv_gpu : public spmdnv { +public: + using spmdnv::spmdnv; + using spmdnv::initInputMatrixVector; + using spmdnv::nnz_; + using spmdnv::m_; + using spmdnv::n_; + using spmdnv::A_; + using spmdnv::x_; + using spmdnv::y_; + using spmdnv::offload_; + using spmdnv::sparsity_; + + ~spmdnv_gpu() { + if (initialised_) { + rocsparse_destroy_handle(handle_); + hipCheckError(hipStreamDestroy(s1_)); + hipCheckError(hipStreamDestroy(s2_)); + hipCheckError(hipStreamDestroy(s3_)); + } + } + + void initialise(gpuOffloadType offload, int m, int n, double sparsity) + override { + // Set up problem parameters + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + if (print_) std::cout << "Initialising with matrix of " << m << "x" << n << std::endl; + m_ = m; + n_ = n; + sparsity_ = sparsity; + offload_ = offload; + + nnz_ = 1 + (uint64_t)((double)m_ * (double)n_ * (1.0 - sparsity_)); + + + // Set up rocSPARSE metadata + index_ = rocsparse_indextype_i64; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + base_ = rocsparse_index_base_zero; + algorithm_ = rocsparse_spmv_alg_default; // There are a couple of CSR algorithms -- investigate which is best! 
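// --- Illustrative sketch (hypothetical helper, not part of the diff above) --
// The SpMDnV kernel below computes y = alpha * A * x + beta * y with A held in
// CSR. When experimenting with the different rocsparse_spmv_alg_* choices
// noted above, a small host reference such as this is useful for confirming
// the result is unchanged; csrSpmvReference is invented for illustration and
// is not benchmark code.
#include <cstdint>

template <typename T>
void csrSpmvReference(int64_t m, const int64_t* rowPtr, const int64_t* colInd,
                      const T* vals, const T* x, T* y, T alpha, T beta) {
  for (int64_t row = 0; row < m; ++row) {
    T acc = T(0);
    // Dot product of the sparse row with the dense vector x.
    for (int64_t idx = rowPtr[row]; idx < rowPtr[row + 1]; ++idx) {
      acc += vals[idx] * x[colInd[idx]];
    }
    y[row] = alpha * acc + beta * y[row];
  }
}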
+ if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + throw std::runtime_error("Unsupported data type for spmdnv_gpu"); + } + + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + // Get the GPU + int count; + hipCheckError(hipGetDeviceCount(&count)); + if (print_) std::cout << "Number of devices: " << count << std::endl; + if (print_) std::cout << "Getting device ID" << std::endl; + if (print_) std::cout << "\t\tGetting GPU device" << std::endl; + hipCheckError(hipGetDevice(&gpuDevice_)); + if (print_) std::cout << "Device ID: " << gpuDevice_ << std::endl; + + // Make streams for asynchronous GPU comunication + if (print_) std::cout << "\t\tCreating GPU streams" << std::endl; + hipCheckError(hipStreamCreate(&s1_)); + hipCheckError(hipStreamCreate(&s2_)); + hipCheckError(hipStreamCreate(&s3_)); + + if (print_) std::cout << "\t\tSetting up GPU handle" << std::endl; + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * n_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipMallocManaged(&x_, sizeof(T) * n_)); + hipCheckError(hipMallocManaged(&y_, sizeof(T) * m_)); + } else { + // Host data structures + hipCheckError(hipHostMalloc((void**)&A_, sizeof(T) * m_ * n_)); + hipCheckError(hipHostMalloc((void**)&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&A_cols_, sizeof(int64_t) * nnz_)); + hipCheckError(hipHostMalloc((void**)&A_vals_, sizeof(T) * nnz_)); + hipCheckError(hipHostMalloc((void**)&x_, sizeof(T) * n_)); + hipCheckError(hipHostMalloc((void**)&y_, sizeof(T) * m_)); + // GPU data structures + hipCheckError(hipMalloc((void**)&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc((void**)&A_cols_device_, sizeof(int64_t) * nnz_)); + hipCheckError(hipMalloc((void**)&A_vals_device_, sizeof(T) * nnz_)); + hipCheckError(hipMalloc((void**)&x_device_, sizeof(T) * n_)); + hipCheckError(hipMalloc((void**)&y_device_, sizeof(T) * m_)); + } + + if (print_) std::cout << "\tInitialising matrix and vector" << std::endl; + initInputMatrixVector(); + } + + +protected: + void toSparseFormat() override { + if (print_) std::cout << "\tTo Sparse" << std::endl; + int64_t nnz_encountered = 0; + + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < n_; col++) { + if (A_[(row * n_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * n_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + } + +private: + /** + * Before we enter the loop of calling the kernel, + * we need to move any data we may need. 
+ */ + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + // For Always there is nothing to do here, + // as all memory is moved each time the + // kernel is called + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(x_device_, + x_, + sizeof(T) * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(y_device_, + y_, + sizeof(T) * m_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, sizeof(int64_t) * (m_ + 1), gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_cols_, sizeof(int64_t) * nnz_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, sizeof(T) * nnz_, gpuDevice_, s1_)); + hipCheckError(hipMemPrefetchAsync(x_, sizeof(T) * n_, gpuDevice_, s2_)); + hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, gpuDevice_, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpMDnV() override { + if (print_) std::cout << "callSpMDnV" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + // Start by moving all the data over to the GPU + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * nnz_, + hipMemcpyHostToDevice, + s1_)); + hipCheckError(hipMemcpyAsync(x_device_, + x_, + sizeof(T) * n_, + hipMemcpyHostToDevice, + s2_)); + hipCheckError(hipMemcpyAsync(y_device_, + y_, + sizeof(T) * m_, + hipMemcpyHostToDevice, + s3_)); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + 
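+          // rocsparse_spmv is invoked in three stages: buffer_size (queries the
+          // required scratch space), preprocess (analyses the CSR structure into
+          // temp_buffer), then compute. The same sequence is repeated in the
+          // 'once' and 'unified' cases below.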
hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + + // Move result back to the CPU + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + // Set up the rocSPARSE structures for the SpMDnV + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; + // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_device_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + break; + } + case gpuOffloadType::unified: { + // Set up the rocSPARSE structures for the SpMDnV + if (print_) std::cout << "\tCreating rocSPARSE structures" << std::endl; 
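+          // In the unified-memory path the managed pointers (A_rows_, A_cols_,
+          // A_vals_, x_, y_) are handed straight to rocSPARSE; no explicit device
+          // copies exist, the HIP runtime migrates pages on demand.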
+ // Set up the rocSPARSE structures for the SpMDnV + status_ = rocsparse_create_csr_descr(&description_, + m_, + n_, + nnz_, + A_rows_, + A_cols_, + A_vals_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + status_ = rocsparse_create_dnvec_descr(&x_description_, + n_, + x_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for x"); + + status_ = rocsparse_create_dnvec_descr(&y_description_, + m_, + y_, + dataType_); + checkStatus("Failed rocsparse_create_dnvec_descr for y"); + hipCheckError(hipDeviceSynchronize()); + + size_t buffer_size = 0; + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_buffer_size, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + void* temp_buffer; + hipCheckError(hipMalloc(&temp_buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_preprocess, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_preprocess"); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_spmv(handle_, + operation_, + &alpha, + description_, + x_description_, + &beta, + y_description_, + dataType_, + algorithm_, + rocsparse_spmv_stage_compute, + &buffer_size, + temp_buffer); + checkStatus("Failed rocsparse_spmv_ex with rocsparse_spmv_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tdestroying rocSPARSE structures" << std::endl; + // Now clean up + status_ = rocsparse_destroy_spmat_descr(description_); + checkStatus("Failed rocsparse_destroy_spmat_descr"); + hipCheckError(hipFree(temp_buffer)); + break; + } + } + } + + void postLoopRequirements() override { + if (print_) std::cout << "Post loop " << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + // Move result back to the CPU + if (print_) std::cout << "\tMovin data to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(y_, y_device_, sizeof(T) * m_, hipMemcpyDeviceToHost, s3_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + // Ensure all output data resides on host once work has completed + if (print_) std::cout << "\tMovin data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(y_, sizeof(T) * m_, hipCpuDeviceId, s3_)); + // Ensure device has finished all work. 
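+        // (Prefetching to hipCpuDeviceId migrates the managed pages back to host
+        // memory; the synchronize below ensures y_ is resident before the result
+        // is consumed on the CPU.)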
+ hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel cleanup" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "\tFreeing unified arrays" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(x_)); + hipCheckError(hipFree(y_)); + } else { + if (print_) std::cout << "\tFreeing CPU arrays" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)x_)); + hipCheckError(hipHostFree((void*)y_)); + + if (print_) std::cout << "\tFreeing GPU arrays" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(x_device_)); + hipCheckError(hipFree(y_device_)); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << std::endl; + exit(1); + } + } + + bool initialised_ = false; + + bool print_ = false; + + rocsparse_status status_; + rocsparse_operation operation_; + rocsparse_handle handle_; + rocsparse_indextype index_; + rocsparse_matrix_type type_; + rocsparse_index_base base_; + rocsparse_datatype dataType_; + rocsparse_spmv_alg algorithm_; + + rocsparse_spmat_descr description_; + rocsparse_dnvec_descr x_description_; + rocsparse_dnvec_descr y_description_; + + + int64_t* A_rows_; + int64_t* A_cols_; + T* A_vals_; + + int64_t* A_rows_device_; + int64_t* A_cols_device_; + T* A_vals_device_; + T* x_device_; + T* y_device_; + + int gpuDevice_; + hipStream_t s1_, s2_, s3_; + + const T alpha = ALPHA; + const T beta = BETA; +}; +} + +#endif diff --git a/rocBLAS/spmspm.hh b/rocBLAS/spmspm.hh new file mode 100644 index 0000000..30c2703 --- /dev/null +++ b/rocBLAS/spmspm.hh @@ -0,0 +1,1107 @@ +#pragma once + +#ifdef GPU_ROCBLAS +#include +#include + +#include +#include "../include/kernels/GPU/spmspm.hh" +#include "../include/utilities.hh" +#include "common.hh" + +namespace gpu { +template +class spmspm_gpu : public spmspm { +public: + using spmspm::spmspm; + using spmspm::initInputMatrices; + using spmspm::A_nnz_; + using spmspm::B_nnz_; + using spmspm::m_; + using spmspm::n_; + using spmspm::k_; + using spmspm::A_; + using spmspm::B_; + using spmspm::C_; + using spmspm::offload_; + using spmspm::sparsity_; + using spmspm::C_nnz_; + using spmspm::C_vals_; + using spmspm::C_rows_; + using spmspm::C_cols_; + + ~spmspm_gpu() { + if (initialised_) { + status_ = rocsparse_destroy_handle(handle_); + checkStatus("Failed rocsparse_destroy_handle"); + hipCheckError(hipStreamDestroy(stream_)); + initialised_ = false; + } + } + + void initialise(gpuOffloadType offload, int m, int n, int k, + double sparsity, bool binary = false) override { + if (print_) { + switch (offload) { + case gpuOffloadType::always: { + std::cout << "=========== ALWAYS ===========" << std::endl; + break; + } + case gpuOffloadType::once: { + std::cout << "=========== ONCE ===========" << std::endl; + break; + } + case gpuOffloadType::unified: { + std::cout << "=========== UNIFIED ===========" << std::endl; + break; + } + } + } + + if (print_) std::cout << "Initialising " << m << "x" << k << " . 
" << k << "x" << n << std::endl; + firstRun_ = true; + + m_ = m; + n_ = n; + k_ = k; + sparsity_ = sparsity; + offload_ = offload; + A_nnz_ = 1 + (int64_t)((double)m_ * (double)k_ * (1.0 - sparsity_)); + B_nnz_ = 1 + (int64_t)((double)k_ * (double)n_ * (1.0 - sparsity_)); + + // Set up rocSPARSE metadata + index_ = rocsparse_indextype_i64; + base_ = rocsparse_index_base_zero; + type_ = rocsparse_matrix_type_general; + operation_ = rocsparse_operation_none; + algorithm_ = rocsparse_spgemm_alg_default; + + if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f32_r; + } else if constexpr (std::is_same_v) { + dataType_ = rocsparse_datatype_f64_r; + } else { + static_assert("Unsupported data type for rocSPARSE"); + } + + if (print_) std::cout << "\tAbout to set up handle and hip streams" << std::endl; + if (!initialised_) { + status_ = rocsparse_create_handle(&handle_); + checkStatus("Failed rocsparse_create_handle"); + + // Get the GPU + hipCheckError(hipGetDevice(&gpuDevice_)); + // Make streams for asynchronous GPU comunication + hipCheckError(hipStreamCreate(&stream_)); + + status_ = rocsparse_set_stream(handle_, stream_); + checkStatus("Failed rocsparse_get_stream"); + } + + if (print_) std::cout << "\tAbout to malloc arrays" << std::endl; + if (offload_ == gpuOffloadType::unified) { + hipCheckError(hipMallocManaged(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipMallocManaged(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&A_cols_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipMallocManaged(&A_vals_, sizeof(T) * A_nnz_)); + hipCheckError(hipMallocManaged(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipMallocManaged(&B_rows_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipMallocManaged(&B_cols_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipMallocManaged(&B_vals_, sizeof(T) * B_nnz_)); + hipCheckError(hipMallocManaged(&D_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMallocManaged(&D_cols_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipMallocManaged(&D_vals_, sizeof(T) * D_nnz_)); + + hipCheckError(hipDeviceSynchronize()); + } else { + // Host data structures + hipCheckError(hipHostMalloc(&A_, sizeof(T) * m_ * k_)); + hipCheckError(hipHostMalloc(&A_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc(&A_cols_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipHostMalloc(&A_vals_, sizeof(T) * A_nnz_)); + hipCheckError(hipHostMalloc(&B_, sizeof(T) * k_ * n_)); + hipCheckError(hipHostMalloc(&B_rows_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipHostMalloc(&B_cols_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipHostMalloc(&B_vals_, sizeof(T) * B_nnz_)); + hipCheckError(hipHostMalloc(&D_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc(&D_cols_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipHostMalloc(&D_vals_, sizeof(T) * D_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + // GPU data structures + hipCheckError(hipMalloc(&A_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc(&A_cols_device_, sizeof(int64_t) * A_nnz_)); + hipCheckError(hipMalloc(&A_vals_device_, sizeof(T) * A_nnz_)); + hipCheckError(hipMalloc(&B_rows_device_, sizeof(int64_t) * (k_ + 1))); + hipCheckError(hipMalloc(&B_cols_device_, sizeof(int64_t) * B_nnz_)); + hipCheckError(hipMalloc(&B_vals_device_, sizeof(T) * B_nnz_)); + hipCheckError(hipMalloc(&D_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipMalloc(&D_cols_device_, sizeof(int64_t) * D_nnz_)); + hipCheckError(hipMalloc(&D_vals_device_, sizeof(T) * 
D_nnz_)); + hipCheckError(hipDeviceSynchronize()); + } + + + if (print_) std::cout << "\tInitialising matrices" << std::endl; + uint64_t outputNNZ = 0; + while (outputNNZ == 0) { + initInputMatrices(); + outputNNZ = calcNNZC(); + } + } + +protected: + void toSparseFormat() override { + if (print_) std::cout << "Making sparse now" << std::endl; + int64_t nnz_encountered = 0; + + if (print_) std::cout << "\tA into CSR" << std::endl; + // Convert A to CSR format + A_rows_[0] = 0; + + for (int64_t row = 0; row < m_; row++) { + for (int64_t col = 0; col < k_; col++) { + if (A_[(row * k_) + col] != 0.0) { + A_cols_[nnz_encountered] = col; + A_vals_[nnz_encountered] = static_cast(A_[(row * k_) + col]); + nnz_encountered++; + } + } + A_rows_[row + 1] = nnz_encountered; + } + + // Verify A conversion + if (nnz_encountered != A_nnz_) { + std::cerr << "Warning: A matrix has " << nnz_encountered << " non-zeros, expected " << A_nnz_ << std::endl; + A_nnz_ = nnz_encountered; // Update to actual count + } + + if (print_) std::cout << "\tB into CSR" << std::endl; + // Convert B to CSR format + nnz_encountered = 0; + + B_rows_[0] = 0; + + for (int64_t row = 0; row < k_; row++) { + for (int64_t col = 0; col < n_; col++) { + if (B_[(row * n_) + col] != 0.0) { + B_cols_[nnz_encountered] = col; + B_vals_[nnz_encountered] = static_cast(B_[(row * n_) + col]); + nnz_encountered++; + } + } + B_rows_[row + 1] = nnz_encountered; + } + + // Verify B conversion + if (nnz_encountered != B_nnz_) { + std::cerr << "Warning: B matrix has " << nnz_encountered << " non-zeros, expected " << B_nnz_ << std::endl; + B_nnz_ = nnz_encountered; // Update to actual count + } + + // Make D a possible matrix + D_cols_[0] = 0; + D_vals_[0] = 1.0; + D_rows_[0] = 0; + for (size_t i = 1; i < (m_ + 1); i++) { + D_rows_[i] = 1; + } + + // Ensure synchronization for unified memory + hipCheckError(hipDeviceSynchronize()); + } + +private: + void preLoopRequirements() override { + if (print_) std::cout << "pre-loop stuff" << std::endl; + switch (offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_rows_device_, + B_rows_, + sizeof(int64_t) * (k_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_cols_device_, + B_cols_, + sizeof(int64_t) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_vals_device_, + B_vals_, + sizeof(T) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_rows_device_, + D_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_cols_device_, + D_cols_, + sizeof(int64_t) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_vals_device_, + D_vals_, + sizeof(T) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(A_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + stream_)); + 
hipCheckError(hipMemPrefetchAsync(A_cols_, + sizeof(int64_t) * A_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(A_vals_, + sizeof(T) * A_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_rows_, + sizeof(int64_t) * (k_ + 1), + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_cols_, + sizeof(int64_t) * B_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(B_vals_, + sizeof(T) * B_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_rows_, + sizeof(int64_t) * (m_ + 1), + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_cols_, + sizeof(int64_t) * D_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipMemPrefetchAsync(D_vals_, + sizeof(T) * D_nnz_, + gpuDevice_, + stream_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void callSpmspm() override { + if (print_) std::cout << "Calling spmspm kernel" << std::endl; + switch (offload_) { + case gpuOffloadType::unified: { + size_t buffer_size = 0; + // Check if there are old arrays to get rid of + if (!firstRun_) { + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + } + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMallocManaged(&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, m_, k_, A_nnz_, A_rows_, A_cols_, + A_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, k_, n_, B_nnz_, B_rows_, B_cols_, + B_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, m_, n_, 0, C_rows_, nullptr, + nullptr, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, m_, n_, D_nnz_, D_rows_, D_cols_, + D_vals_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMallocManaged(&buffer, buffer_size)); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = 
rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMallocManaged(&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMallocManaged(&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + status_ = rocsparse_csr_set_pointers(description_C_, C_rows_, C_cols_, C_vals_); + checkStatus("Failed rocsparse_csr_set_pointers"); + hipCheckError(hipDeviceSynchronize()); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." << std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + hipCheckError(hipDeviceSynchronize()); + firstRun_ = false; + break; + } + case gpuOffloadType::always: { + if (print_) std::cout << "\tMoving data to GPU" << std::endl; + hipCheckError(hipMemcpyAsync(A_rows_device_, + A_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_cols_device_, + A_cols_, + sizeof(int64_t) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(A_vals_device_, + A_vals_, + sizeof(T) * A_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_rows_device_, + B_rows_, + sizeof(int64_t) * (k_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_cols_device_, + B_cols_, + sizeof(int64_t) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(B_vals_device_, + B_vals_, + sizeof(T) * B_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_rows_device_, + D_rows_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_cols_device_, + D_cols_, + sizeof(int64_t) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipMemcpyAsync(D_vals_device_, + D_vals_, + sizeof(T) * D_nnz_, + hipMemcpyHostToDevice, + stream_)); + hipCheckError(hipDeviceSynchronize()); + size_t buffer_size = 0; + + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMalloc((void**)&C_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, m_, k_, A_nnz_, A_rows_device_, A_cols_device_, + A_vals_device_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, k_, n_, B_nnz_, B_rows_device_, B_cols_device_, + B_vals_device_, index_, index_, base_, 
dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, m_, n_, 0, C_rows_device_, nullptr, + nullptr, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, m_, n_, D_nnz_, D_rows_device_, D_cols_device_, + D_vals_device_, index_, index_, base_, dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMalloc(&buffer, buffer_size)); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMalloc((void**)&C_cols_device_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMalloc((void**)&C_vals_device_, sizeof(T) * C_nnz_)); + + status_ = rocsparse_csr_set_pointers(description_C_, + C_rows_device_, + C_cols_device_, + C_vals_device_); + checkStatus("Failed rocsparse_csr_set_pointers"); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." 
<< std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + + if (print_) std::cout << "\tAllocating host C arrays" << std::endl; + // Allocate host arrays for C + hipCheckError(hipHostMalloc((void**)&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipHostMalloc((void**)&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + // Moving data to CPU + if (print_) std::cout << "\tTransfering data back to CPU" << std::endl; + hipCheckError(hipMemcpyAsync(C_rows_, + C_rows_device_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_cols_, + C_cols_device_, + sizeof(int64_t) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_vals_, + C_vals_device_, + sizeof(T) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipDeviceSynchronize()); + + // Freeing stuff up + if (print_) std::cout << "\tFreeing C arrays (host and device)" << std::endl; + hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::once: { + size_t buffer_size; + // Check if there are old arrays to get rid of + if (!firstRun_) { + hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipDeviceSynchronize()); + } + + if (print_) std::cout << "\tAllocating C rows" << std::endl; + hipCheckError(hipMalloc((void**)&C_rows_device_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipDeviceSynchronize()); + + // Set up the rocSPARSE structures for the MM + if (print_) std::cout << "\tCreating csr descriptions" << std::endl; + status_ = rocsparse_create_csr_descr(&description_A_, + m_, + k_, + A_nnz_, + A_rows_device_, + A_cols_device_, + A_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_B_, + k_, + n_, + B_nnz_, + B_rows_device_, + B_cols_device_, + B_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_C_, + m_, + n_, + 0, + C_rows_device_, + nullptr, + nullptr, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + status_ = rocsparse_create_csr_descr(&description_D_, + m_, + n_, + D_nnz_, + D_rows_device_, + D_cols_device_, + D_vals_device_, + index_, + index_, + base_, + dataType_); + checkStatus("Failed rocsparse_create_csr_descr"); + + if (print_) std::cout << "\tDetermining buffer size" << std::endl; + stage_ = rocsparse_spgemm_stage_buffer_size; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + 
description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + nullptr); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_buffer_size"); + + if (print_) std::cout << "\tAllocating buffer and C_rows" << std::endl; + void* buffer; + hipCheckError(hipMalloc(&buffer, buffer_size)); + + if (print_) std::cout << "\tDetermining nnz" << std::endl; + stage_ = rocsparse_spgemm_stage_nnz; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_nnz"); + + if (print_) std::cout << "\tAllocating rows and vals" << std::endl; + int64_t rowC, colC; + status_ = rocsparse_spmat_get_size(description_C_, &rowC, &colC, &C_nnz_); + checkStatus("Failed rocsparse_spmat_get_size"); + + hipCheckError(hipMalloc((void**)&C_cols_device_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipMalloc((void**)&C_vals_device_, sizeof(T) * C_nnz_)); + + status_ = rocsparse_csr_set_pointers(description_C_, + C_rows_device_, + C_cols_device_, + C_vals_device_); + checkStatus("Failed rocsparse_csr_set_pointers"); + + if (print_) std::cout << "\tDoing calculation" << std::endl; + stage_ = rocsparse_spgemm_stage_compute; + status_ = rocsparse_spgemm(handle_, + operation_, + operation_, + &alpha, + description_A_, + description_B_, + &beta, + description_D_, + description_C_, + dataType_, + algorithm_, + stage_, + &buffer_size, + buffer); + checkStatus("Failed rocsparse_spgemm with stage=rocsparse_spgemm_stage_compute"); + + + if (print_) std::cout << "\tFreeing buffer and descriptions etc." << std::endl; + // Freeing up buffer + hipCheckError(hipFree(buffer)); + status_ = rocsparse_destroy_spmat_descr(description_A_); + checkStatus("Failing rocsparse_destroy_mat_descr for A"); + status_ = rocsparse_destroy_spmat_descr(description_B_); + checkStatus("Failing rocsparse_destroy_mat_descr for B"); + status_ = rocsparse_destroy_spmat_descr(description_C_); + checkStatus("Failing rocsparse_destroy_mat_descr for C"); + status_ = rocsparse_destroy_spmat_descr(description_D_); + checkStatus("Failing rocsparse_destroy_mat_descr for D"); + firstRun_ = false; + break; + } + } + } + + void postLoopRequirements() override { + if (print_) std::cout << "Post-Loop stuff" << std::endl; + switch(offload_) { + case gpuOffloadType::always: { + break; + } + case gpuOffloadType::once: { + if (print_) std::cout << "\tAllocating host arrays for C" << std::endl; + // Allocate host arrays for C + hipCheckError(hipHostMalloc((void**)&C_rows_, sizeof(int64_t) * (m_ + 1))); + hipCheckError(hipHostMalloc((void**)&C_cols_, sizeof(int64_t) * C_nnz_)); + hipCheckError(hipHostMalloc((void**)&C_vals_, sizeof(T) * C_nnz_)); + hipCheckError(hipDeviceSynchronize()); + + + if (print_) std::cout << "\tMoving C data to host" << std::endl; + // Moving data to CPU + hipCheckError(hipMemcpyAsync(C_rows_, + C_rows_device_, + sizeof(int64_t) * (m_ + 1), + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_cols_, + C_cols_device_, + sizeof(int64_t) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipMemcpyAsync(C_vals_, + C_vals_device_, + sizeof(T) * C_nnz_, + hipMemcpyDeviceToHost, + stream_)); + hipCheckError(hipDeviceSynchronize()); + + // Freeing stuff up + if (print_) std::cout << "\tFreeing C arrays (host and device)" << std::endl; + 
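+        // C's sparsity pattern (and therefore C_nnz_) is only known after the
+        // SpGEMM nnz stage, so the C buffers are re-created on every run; they are
+        // released here once the result has been copied back to the host.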
hipCheckError(hipFree(C_rows_device_)); + hipCheckError(hipFree(C_cols_device_)); + hipCheckError(hipFree(C_vals_device_)); + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + case gpuOffloadType::unified: { + if (print_) std::cout << "\tMoving data to CPU" << std::endl; + hipCheckError(hipMemPrefetchAsync(C_rows_, + sizeof(int64_t) * (m_ + 1), + hipCpuDeviceId, + stream_)); + hipCheckError(hipMemPrefetchAsync(C_cols_, + sizeof(int64_t) * C_nnz_, + hipCpuDeviceId, + stream_)); + hipCheckError(hipMemPrefetchAsync(C_vals_, + sizeof(T) * C_nnz_, + hipCpuDeviceId, + stream_)); + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "\tFreeing C arrays" << std::endl; + hipCheckError(hipFree(C_rows_)); + hipCheckError(hipFree(C_cols_)); + hipCheckError(hipFree(C_vals_)); + hipCheckError(hipDeviceSynchronize()); + break; + } + } + } + + void postCallKernelCleanup() override { + if (print_) std::cout << "Post-kernel clean up" << std::endl; + if (offload_ == gpuOffloadType::unified) { + if (print_) std::cout << "Freeing unified memory arrays for A and B" << std::endl; + hipCheckError(hipFree(A_)); + hipCheckError(hipFree(A_rows_)); + hipCheckError(hipFree(A_cols_)); + hipCheckError(hipFree(A_vals_)); + hipCheckError(hipFree(B_)); + hipCheckError(hipFree(B_rows_)); + hipCheckError(hipFree(B_cols_)); + hipCheckError(hipFree(B_vals_)); + hipCheckError(hipFree(D_rows_)); + hipCheckError(hipFree(D_cols_)); + hipCheckError(hipFree(D_vals_)); + hipCheckError(hipDeviceSynchronize()); + } else { + if (print_) std::cout << "Freeing host arrays for A and B" << std::endl; + hipCheckError(hipHostFree((void*)A_)); + hipCheckError(hipHostFree((void*)A_rows_)); + hipCheckError(hipHostFree((void*)A_cols_)); + hipCheckError(hipHostFree((void*)A_vals_)); + hipCheckError(hipHostFree((void*)B_)); + hipCheckError(hipHostFree((void*)B_rows_)); + hipCheckError(hipHostFree((void*)B_cols_)); + hipCheckError(hipHostFree((void*)B_vals_)); + hipCheckError(hipHostFree((void*)D_rows_)); + hipCheckError(hipHostFree((void*)D_cols_)); + hipCheckError(hipHostFree((void*)D_vals_)); + hipCheckError(hipDeviceSynchronize()); + if (print_) std::cout << "Freeing GPU arrays for A and B" << std::endl; + hipCheckError(hipFree(A_rows_device_)); + hipCheckError(hipFree(A_cols_device_)); + hipCheckError(hipFree(A_vals_device_)); + hipCheckError(hipFree(B_rows_device_)); + hipCheckError(hipFree(B_cols_device_)); + hipCheckError(hipFree(B_vals_device_)); + hipCheckError(hipFree(D_rows_device_)); + hipCheckError(hipFree(D_cols_device_)); + hipCheckError(hipFree(D_vals_device_)); + hipCheckError(hipDeviceSynchronize()); + } + } + + void checkStatus(std::string message) { + if (status_ != rocsparse_status_success) { + std::cerr << message << " error = "; + switch (status_) { + case rocsparse_status_success: { + std::cerr << "Success" << std::endl; + break; + } + case rocsparse_status_invalid_handle: { + std::cerr << "invalid handle (handle not initialized, invalid or null.)" << std::endl; + break; + } + case rocsparse_status_not_implemented: { + std::cerr << "not imlpemented (function is not implemented.)" << std::endl; + break; + } + case rocsparse_status_invalid_pointer: { + std::cerr << "invalid pointer (invalid pointer parameter.)" << std::endl; + break; + } + case rocsparse_status_invalid_size: { + std::cerr << "invalid size (invalid size parameter.)" << std::endl; + break; + } + case rocsparse_status_memory_error: { + 
          std::cerr << "memory error (failed memory allocation, copy, dealloc.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_internal_error: {
+          std::cerr << "internal error (other internal library failure.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_invalid_value: {
+          std::cerr << "invalid value (invalid value parameter.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_arch_mismatch: {
+          std::cerr << "arch mismatch (device arch is not supported.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_zero_pivot: {
+          std::cerr << "zero pivot (encountered zero pivot.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_not_initialized: {
+          std::cerr << "not initialized (descriptor has not been initialized.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_type_mismatch: {
+          std::cerr << "type mismatch (index types do not match.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_requires_sorted_storage: {
+          std::cerr << "requires sorted storage (sorted storage required.)" << std::endl;
+          break;
+        }
+        case rocsparse_status_thrown_exception: {
+          std::cerr << "thrown exception (exception being thrown.)" << std::endl;
+          break;
+        }
+        default: {
+          std::cerr << "Not a known status enum" << std::endl;
+          break;
+        }
+      }
+      exit(1);
+    }
+  }
+
+  // Count the structural non-zeros of C = A * B: entry (row, col) is counted if
+  // any inner index has both A(row, entry) and B(entry, col) non-zero
+  // (numerical cancellation is ignored).
+  uint64_t calcNNZC() {
+    uint64_t nnzSoFar = 0;
+    for (size_t row = 0; row < m_; row++) {
+      for (size_t col = 0; col < n_; col++) {
+        for (size_t entry = 0; entry < k_; entry++) {
+          if (A_[row * k_ + entry] != 0 && B_[entry * n_ + col] != 0) {
+            nnzSoFar++;
+            break;
+          }
+        }
+      }
+    }
+    if (print_) std::cout << "Calculated nnzC = " << nnzSoFar << std::endl;
+    return nnzSoFar;
+  }
+
+  void printMatrices() {
+    std::cout << "================ Printing matrices ================" << std::endl;
+    std::cout << "A matrix dense:" << std::endl;
+    for (size_t i = 0; i < m_; i++) {
+      for (size_t j = 0; j < k_; j++) {
+        std::cout << A_[i * k_ + j] << " ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "A matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < m_ + 1; i++) {
+      std::cout << A_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < A_nnz_; i++) {
+      std::cout << A_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < A_nnz_; i++) {
+      std::cout << A_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "---------------------------------------------------" << std::endl;
+
+    std::cout << "B matrix dense:" << std::endl;
+    for (size_t i = 0; i < k_; i++) {
+      for (size_t j = 0; j < n_; j++) {
+        std::cout << B_[i * n_ + j] << " ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "B matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < k_ + 1; i++) {
+      std::cout << B_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < B_nnz_; i++) {
+      std::cout << B_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < B_nnz_; i++) {
+      std::cout << B_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "---------------------------------------------------" << std::endl;
+
+    std::cout << "D matrix CSR:" << std::endl;
+    std::cout << "\tRows: ";
+    for (size_t i = 0; i < m_ + 1; i++) {
+      std::cout << D_rows_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tCols: ";
+    for (size_t i = 0; i < D_nnz_; i++) {
+      std::cout << D_cols_[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "\tVals: ";
+    for (size_t i = 0; i < D_nnz_; i++) {
+      std::cout << D_vals_[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "================ Matrices printed! ================" << std::endl;
+  }
+
+  bool print_ = true;
+  bool initialised_ = false;
+  bool firstRun_ = false;
+
+  rocsparse_handle handle_;
+  rocsparse_operation operation_;
+  rocsparse_status status_;
+  rocsparse_indextype index_;
+  rocsparse_index_base base_;
+  rocsparse_matrix_type type_;
+  rocsparse_datatype dataType_;
+  rocsparse_spgemm_stage stage_;
+  rocsparse_spgemm_alg algorithm_;
+
+  rocsparse_spmat_descr description_A_, description_B_, description_C_, description_D_;
+
+  int64_t* A_rows_;
+  int64_t* A_cols_;
+  T* A_vals_;
+  int64_t* B_rows_;
+  int64_t* B_cols_;
+  T* B_vals_;
+
+  int64_t* A_rows_device_;
+  int64_t* A_cols_device_;
+  T* A_vals_device_;
+  int64_t* B_rows_device_;
+  int64_t* B_cols_device_;
+  T* B_vals_device_;
+  int64_t* C_rows_device_;
+  int64_t* C_cols_device_;
+  T* C_vals_device_;
+
+  int64_t* D_rows_;
+  int64_t* D_cols_;
+  T* D_vals_;
+  int64_t* D_rows_device_;
+  int64_t* D_cols_device_;
+  T* D_vals_device_;
+  int64_t D_nnz_ = 1;
+
+  int gpuDevice_;
+  hipStream_t stream_;
+
+  const T alpha = ALPHA;
+  const T beta = BETA;
+};
+}  // namespace gpu
+
+#endif
diff --git a/src/main.cc b/src/main.cc
index 2d046e3..b98350c 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -3,16 +3,56 @@
 int iters = 10;
 int startDim = 1;
 int upperLimit = 128;
+int step = 1;
+double sparsity = 0.99;
+// GEMV kernels
+bool doSgemv = true;
+bool doDgemv = true;
+// Sparse GEMV kernels
+bool doSspmdnv = true;
+bool doDspmdnv = true;
+// GEMM kernels
+bool doSgemm = true;
+bool doDgemm = true;
+// Sparse GEMM kernels
+bool doSspmdnm = true;
+bool doDspmdnm = true;
+// Sparse-sparse matrix multiplication kernels
+bool doSspmspm = true;
+bool doDspmspm = true;
 bool doCpu = CPU_ENABLED;
 bool doGpu = GPU_ENABLED;
+matrixType type = matrixType::random;
+
 std::string CSV_DIR = "CSV_Results";
 int main(int argc, char** argv) {
   getParameters(argc, argv);
   printBenchmarkConfig(iters, upperLimit);
+#ifdef CPU_ARMPL
+  if (doSspmdnm || doDspmdnm) {
+    std::cout << "WARNING - ArmPL does not currently provide a Sparse Matrix-Dense Matrix kernel. Disabling Sparse Matrix-Dense Matrix tests." << std::endl;
+    doSspmdnm = false;
+    doDspmdnm = false;
+  }
+#endif
+
+#ifdef CPU_NVPL
+  if (doSspmdnm || doDspmdnm) {
+    std::cout << "WARNING - NVPL does not currently provide a Sparse Matrix-Dense Matrix kernel. Disabling Sparse Matrix-Dense Matrix tests." << std::endl;
+    doSspmdnm = false;
+    doDspmdnm = false;
+  }
+  if (doSspmspm || doDspmspm) {
+    std::cout << "WARNING - NVPL does not currently provide a Sparse Matrix-Sparse Matrix kernel. Disabling Sparse Matrix-Sparse Matrix tests." << std::endl;
+    doSspmspm = false;
+    doDspmspm = false;
+  }
+#endif
+
 if (!doCpu && !doGpu) {
   std::cout << "Finished!" << std::endl;
   exit(0);
@@ -28,41 +68,106 @@ int main(int argc, char** argv) {
 std::cout << "All results will be saved in CSV files at '" << absPath << "'" << std::endl << std::endl;
+// -------- GEMV --------
+  // Single-Precision GEMV
+  if (doSgemv) {
+    std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl;
+    doGemv sgemv(std::string(absPath), iters, startDim, upperLimit,
+                 step, doCpu, doGpu);
+    sgemv.collectData();
+    std::cout << "Finished!"
<< std::endl; + } + + // Double-Precision GEMV + if (doDgemv) { + std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; + doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + dgemv.collectData(); + std::cout << "Finished!" << std::endl; + } + +// // -------- GEMM -------- +// // Single-Precision GEMM + if (doSgemm) { + std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; + doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + sgemm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision GEMM + if (doDgemm) { + std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; + doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, + step, doCpu, doGpu); + dgemm.collectData(); + std::cout << "Finished!" << std::endl; + } - // -------- GEMM -------- - // SGEMM Comparison - std::cout << std::endl << "Comparing SGEMM Kernels:" << std::endl; - doGemm sgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - sgemm.collectData(); - std::cout << "Finished!" << std::endl; - - // DGEMM Comparison - std::cout << std::endl << "Comparing DGEMM Kernels:" << std::endl; - doGemm dgemm(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - dgemm.collectData(); - std::cout << "Finished!" << std::endl; - - // -------- GEMV -------- - // SGEMV Comparison - std::cout << std::endl << "Comparing SGEMV Kernels:" << std::endl; - doGemv sgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - sgemv.collectData(); - std::cout << "Finished!" << std::endl; - - // DGEMV Comparison - std::cout << std::endl << "Comparing DGEMV Kernels:" << std::endl; - doGemv dgemv(std::string(absPath), iters, startDim, upperLimit, doCpu, - doGpu); - dgemv.collectData(); - std::cout << "Finished!" << std::endl; + // -------- SPMDNV -------- + // Single-Precision Sparse Matrix-Dense Vector + if (doSspmdnv) { + std::cout << std::endl << "Comparing SSPMDNV Kernels:" << std::endl; + doSpmdnv sspmdnv(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmdnv.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Dense Vector + if (doDspmdnv) { + std::cout << std::endl << "Comparing DSPMDNV Kernels:" << std::endl; + doSpmdnv dspmdnv(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmdnv.collectData(); + std::cout << "Finished!" << std::endl; + } + + // // -------- SPMDNM -------- + // // Single-Precision Sparse Matrix-Dense Matrix + if (doSspmdnm) { + std::cout << std::endl << "Comparing SSpMDnM Kernels:" << std::endl; + doSpmdnm sspmdnm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmdnm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Dense Matrix + if (doDspmdnm) { + std::cout << std::endl << "Comparing DSpMDnM Kernels:" << std::endl; + doSpmdnm dspmdnm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmdnm.collectData(); + std::cout << "Finished!" 
<< std::endl; + } + + // -------- SPMSPM -------- + // Single-Precision Sparse Matrix-Sparse Matrix + if (doSspmspm) { + std::cout << std::endl << "Comparing SSpMSpM Kernels:" << std::endl; + doSpmspm sspmspm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + sspmspm.collectData(); + std::cout << "Finished!" << std::endl; + } + + // Double-Precision Sparse Matrix-Sparse Matrix + if (doDspmspm) { + std::cout << std::endl << "Comparing DSpMSpM Kernels:" << std::endl; + doSpmspm dspmspm(std::string(absPath), iters, startDim, upperLimit, + step, sparsity, type, doCpu, doGpu); + dspmspm.collectData(); + std::cout << "Finished!" << std::endl; + } free(absPath); return 0; } + void printBenchmarkConfig(const int iters, const int upperLimit) { std::string cpuEnabledStr = (doCpu) ? "True" : "False"; std::string gpuEnabledStr = (doGpu) ? "True" : "False"; @@ -71,16 +176,33 @@ void printBenchmarkConfig(const int iters, const int upperLimit) { (getenv("BLIS_NUM_THREADS") != NULL) ? atoi(getenv("BLIS_NUM_THREADS")) : 1; #else - (getenv("OMP_NUM_THREADS") != NULL) ? atoi(getenv("OMP_NUM_THREADS")) : 1; + (getenv("OMP_NUM_THREADS") != nullptr) ? atoi(getenv("OMP_NUM_THREADS")) : 1; #endif const char* ompProcBind = - (getenv("OMP_PROC_BIND") != NULL) ? getenv("OMP_PROC_BIND") : "Not Set"; + (getenv("OMP_PROC_BIND") != nullptr) ? getenv("OMP_PROC_BIND") : "Not " + "Set"; const char* ompPlaces = - (getenv("OMP_PLACES") != NULL) ? getenv("OMP_PLACES") : "Not Set"; + (getenv("OMP_PLACES") != nullptr) ? getenv("OMP_PLACES") : "Not Set"; + const char* matrixType; + switch (type) { + case matrixType::rmat: + matrixType = "rMAT"; + break; + case matrixType::random: + matrixType = "random"; + break; + case matrixType::finiteElements: + matrixType = "finiteElements"; + break; + default: + matrixType = "Unknown"; + break; + } std::cout << "GPU BLAS Offload Benchmark:" << std::endl; std::cout << "\tIterations per Kernel: " << iters << std::endl; std::cout << "\tStarting Problem Dimension: " << startDim << std::endl; std::cout << "\tMaximum Problem Dimension: " << upperLimit << std::endl; + std::cout << "\tSparse Matrix Type: " << matrixType << std::endl; std::cout << "\tCPU Kernels Enabled: " << cpuEnabledStr << std::endl; std::cout << "\tCPU Library: " << CPU_LIB_NAME << std::endl; std::cout << "\tGPU Kernels Enabled: " << gpuEnabledStr << std::endl; @@ -112,7 +234,7 @@ int parseInt(const char* str) { return strlen(next) ? 
-1 : value; } -void getParameters(int argc, char* argv[]) { +void getParameters(int argc, char** argv) { for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "--iterations") || !strcmp(argv[i], "-i")) { if (++i >= argc || (iters = parseInt(argv[i])) < 0) { @@ -137,17 +259,62 @@ void getParameters(int argc, char* argv[]) { << std::endl; exit(1); } + } else if (!strcmp(argv[i], "--step")) { + if (++i >= argc || (step = parseInt(argv[i])) < 0) { + std::cout << "ERROR - Invalid dimension step size" << std::endl; + exit(1); + } } else if (!strcmp(argv[i], "--no_cpu")) { doCpu = false; } else if (!strcmp(argv[i], "--no_gpu")) { doGpu = false; - } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { - if (++i >= argc) { - std::cout << "ERROR - Invalid output directory" << std::endl; + } else if (!strcmp(argv[i], "--kernels") || !strcmp(argv[i], "-k")) { + std::string kernelList = argv[++i]; + doSgemm = (kernelList.find("sgemm") != std::string::npos); + doDgemm = (kernelList.find("dgemm") != std::string::npos); + doSspmdnm = (kernelList.find("sspmdnm") != std::string::npos); + doDspmdnm = (kernelList.find("dspmdnm") != std::string::npos); + doSspmspm = (kernelList.find("sspmspm") != std::string::npos); + doDspmspm = (kernelList.find("dspmspm") != std::string::npos); + doSgemv = (kernelList.find("sgemv") != std::string::npos); + doDgemv = (kernelList.find("dgemv") != std::string::npos); + doSspmdnv = (kernelList.find("sspmdnv") != std::string::npos); + doDspmdnv = (kernelList.find("dspmdnv") != std::string::npos); + + if (!doSgemv && !doSspmdnv && !doSgemm && !doSspmdnm && !doSspmspm && + !doDgemv && !doDspmdnv && !doDgemm && !doDspmdnm && !doDspmspm) { + std::cout << "ERROR - no implemented kernels in list" << std::endl; exit(1); } else { CSV_DIR = argv[i]; } + } else if (!strcmp(argv[i], "--sparsity")) { + if (++i >= argc || (sparsity = std::stod(argv[i])) < 0 || + sparsity >= 1.00) { + std::cout << "ERROR - Invalid sparsity value" << std::endl; + exit(1); + } + } else if (!strcmp(argv[i], "--matrix_type") || !strcmp(argv[i], "-t")) { + if (++i >= argc) { + std::cout << "ERROR - No matrix type specified" << std::endl; + exit(1); + } else if (!strcmp(argv[i], "rmat")) { + type = matrixType::rmat; + } else if (!strcmp(argv[i], "random")) { + type = matrixType::random; + } else if (!strcmp(argv[i], "finiteElements")) { + type = matrixType::finiteElements; + } else { + std::cout << "ERROR - Unrecognized matrix type '" << argv[i] + << "'" << std::endl; + exit(1); + } + } else if (!strcmp(argv[i], "--output_dir") || !strcmp(argv[i], "-o")) { + if (++i >= argc) { + std::cout << "ERROR - No output directory specified" << std::endl; + exit(1); + } + CSV_DIR = argv[i]; } else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) { std::cout << std::endl; std::cout << "Usage: ./gpu-blob [OPTIONS]" << std::endl << std::endl; @@ -158,19 +325,35 @@ void getParameters(int argc, char* argv[]) { << std::endl; std::cout << " --no_gpu Disable all GPU kernel Runs" << std::endl; - std::cout - << " -o --output_dir The CSV file output directory" - << std::endl; + std::cout << " -o --output_dir The CSV file output directory" + << std::endl; std::cout << " -i --iterations I Repeat each kernel I times " - "(default: " - << iters << ")" << std::endl; + "(default: " << iters << ")" + << std::endl; std::cout << " -s --start_dimension S First value of M, N, K is S " - "(default: " - << startDim << ")" << std::endl; + "(default: " << startDim << ")" + << std::endl; + std::cout << " --step St Step size 
between values of M, N, K" + "(default: " << step << ")" + << std::endl; std::cout << " -d --dimension_limit D Max value of M, N, K is D " - "(default: " - << upperLimit << ")" << std::endl; - std::cout << std::endl; + "(default: " << upperLimit << ")" + << std::endl; + std::cout << " -k --kernels Comma-separated list of " + "kernels to be run. Options are sgemm, dgemm, sspmdnm, " + "dspmdnm, sspmspm, dspmspm, sgemv, dgemv, sspmdnv, dspmdnv " + "(default: `-k sgemm,dgemm,sspmdnm,dspmdnm,sspmspm,dspmspm," + "sgemv,dgemv,sspmdnv,dspmdnv`)" + << std::endl; + std::cout << " --sparsity Sp Sparsity value, between 0 " + "and 1 (double), to be used by the sparse BLAS kernels. " + "Matrices with be generated with this sparsity value. " + "Defaults to 0.99" + << std::endl; + std::cout << " -t --matrix_type M Type of sparse matrix to use." + ". Only applies to sparse kernels. Options are rmat, random" + ", finiteElements (default -t random)" + << std::endl; exit(0); } else { std::cout << "Unrecognized argument '" << argv[i] << "' (try '--help')" @@ -178,4 +361,4 @@ void getParameters(int argc, char* argv[]) { exit(1); } } -} \ No newline at end of file +}
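A typical invocation exercising the new options might look like the following (hypothetical argument values; flag names as defined in getParameters above): ./gpu-blob --iterations 10 --start_dimension 32 --dimension_limit 2048 --step 32 --sparsity 0.995 --matrix_type rmat --kernels sgemv,sspmdnv,sspmspm --output_dir CSV_Results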