diff --git a/gpu-burn/BurnKernel.cpp b/gpu-burn/BurnKernel.cpp index bd538a4..1e9b10e 100644 --- a/gpu-burn/BurnKernel.cpp +++ b/gpu-burn/BurnKernel.cpp @@ -11,6 +11,9 @@ #include "common.h" #include "BurnKernel.h" +#include "rocblas.h" + +#define EPSILOND 0.0000001f // --------------------------------------------------------------------------- namespace gpuburn { @@ -26,48 +29,55 @@ BurnKernel::BurnKernel(int hipDevice) : mHipDevice(hipDevice), mRunKernel(false), mDeviceAdata(NULL), mDeviceBdata(NULL), mDeviceCdata(NULL) { + + err_num = 0; + } BurnKernel::~BurnKernel() { - if (mBurnThread) + if (mBurnThread){ mBurnThread->join(); + } - if (mDeviceAdata) + if (mDeviceAdata){ hipFree(mDeviceAdata); + } - if (mDeviceBdata) + if (mDeviceBdata){ hipFree(mDeviceBdata); + } - if (mDeviceCdata) + if (mDeviceCdata){ hipFree(mDeviceCdata); + } } // --------------------------------------------------------------------------- -extern "C" __global__ void hip_sgemm_kernel(hipLaunchParm lp, const int M, - const int N, const int K, - const float alpha, - float *A, const int lda, float *B, - const int ldb, const float beta, - float *C, const int ldc) + + +extern "C" __global__ void hip_compare_kernel(double *C, int *faultyElems, size_t iters) { //column major NN - size_t idx_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - size_t idx_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; - size_t dim_x = hipGridDim_x * hipBlockDim_x; + size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x; + size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y; + size_t dim_x = gridDim.x * blockDim.x; + size_t myIdx = idx_y * dim_x + idx_x; - float local_c = beta * C[myIdx]; - for(int k = 0; k < K; k++) { - local_c += alpha * A[ idx_y + k * K] * B[ idx_x * K + k]; - } + size_t iterStep = hipBlockDim_x*hipBlockDim_y*hipGridDim_x*hipGridDim_y; - C[myIdx] = local_c; + int myFaulty = 0; + for (size_t i = 1; i < iters; ++i){ + if(fabs(C[myIdx] - C[myIdx + iterStep]) > EPSILOND){ + myFaulty++; + } + } + atomicAdd(faultyElems, myFaulty); } -// --------------------------------------------------------------------------- int BurnKernel::Init() { @@ -82,14 +92,19 @@ int BurnKernel::Init() mHostBdata[i] = (rand() % 1000000)/100000.0; } + size_t freeMem = getAvailableMemory() * cUseMem; - size_t matrixSizeBytes = sizeof(float)*cMatrixSize; + //size_t matrixSizeBytes = sizeof(float)*cMatrixSize; + size_t matrixSizeBytes = sizeof(double)*cMatrixSize; mNumIterations = (freeMem - (matrixSizeBytes*2))/matrixSizeBytes; checkError(hipMalloc((void**)&mDeviceAdata, matrixSizeBytes), "Alloc A"); checkError(hipMalloc((void**)&mDeviceBdata, matrixSizeBytes), "Alloc B"); checkError(hipMalloc((void**)&mDeviceCdata, matrixSizeBytes*mNumIterations), "Alloc C"); + //rocky added for acc check: + checkError(hipMalloc(&d_faultyElemData, sizeof(int)), "faulty data"); + checkError(hipMemcpy(mDeviceAdata, mHostAdata, matrixSizeBytes, hipMemcpyHostToDevice), "A -> device"); checkError(hipMemcpy(mDeviceBdata, mHostBdata, matrixSizeBytes, hipMemcpyHostToDevice), "B -> device"); checkError(hipMemset(mDeviceCdata, 0, matrixSizeBytes*mNumIterations), "C memset"); @@ -142,25 +157,37 @@ int BurnKernel::runComputeKernel() { int err = 0; + for (int i = 0; mRunKernel && i < mNumIterations; ++i) { - hipLaunchKernel( - /* Launch params */ - HIP_KERNEL_NAME(hip_sgemm_kernel), - dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1), - dim3(cBlockSize,cBlockSize,1), 0, 0, - /* Kernel params */ - cRowSize, cRowSize, cRowSize, cAlpha, - mDeviceAdata, cRowSize, - mDeviceBdata, cRowSize, - cBeta, - mDeviceCdata + i*cMatrixSize, - cRowSize); + + double alpha = 1.1; + double beta = 0.0; + + rocblas_handle handle; + rocblas_create_handle(&handle); + rocblas_dgemm(handle, rocblas_operation_none, rocblas_operation_transpose, cRowSize, cRowSize, cRowSize, &alpha, mDeviceAdata, cRowSize, mDeviceBdata,cRowSize , &beta, mDeviceCdata + i*cMatrixSize, cRowSize); } + + checkError(hipDeviceSynchronize(), "Sync"); // rocky added to fix seg fault + + hipLaunchKernelGGL(HIP_KERNEL_NAME(hip_compare_kernel),dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),dim3(cBlockSize,cBlockSize,1), 0, 0, mDeviceCdata, d_faultyElemData, mNumIterations); + + + int *d_faultyElemsHost; + checkError(hipMemcpy(d_faultyElemsHost, d_faultyElemData, sizeof(int), hipMemcpyDeviceToHost), "Read faultyelemdata"); + + err_num += *d_faultyElemsHost; + checkError(hipDeviceSynchronize(), "Sync"); return err; } +int BurnKernel::get_err_num(){ + return err_num; +} + + size_t BurnKernel::getAvailableMemory() { size_t freeMem, totalMem; diff --git a/gpu-burn/BurnKernel.h b/gpu-burn/BurnKernel.h index d8f9b15..52b95e4 100644 --- a/gpu-burn/BurnKernel.h +++ b/gpu-burn/BurnKernel.h @@ -8,6 +8,7 @@ #define GPUBURN_BURNKERNEL_H_ #include +#include // --------------------------------------------------------------------------- namespace gpuburn { @@ -34,21 +35,35 @@ class BurnKernel { */ int stopBurn(); + // rocky: + int get_err_num(); + private: static constexpr int cRandSeed = 10; static constexpr float cUseMem = 0.80; - static constexpr uint32_t cRowSize = 512; + //static constexpr uint32_t cRowSize = 512; + static constexpr uint32_t cRowSize = 8640; // rocky, 20190809 static constexpr uint32_t cMatrixSize = cRowSize * cRowSize; static constexpr uint32_t cBlockSize = 16; static constexpr float cAlpha = 1.0f; static constexpr float cBeta = 0.0f; - float mHostAdata[cMatrixSize]; - float mHostBdata[cMatrixSize]; + //float mHostAdata[cMatrixSize]; + //float mHostBdata[cMatrixSize]; + + double mHostAdata[cMatrixSize]; + double mHostBdata[cMatrixSize]; + //float* mDeviceAdata; + //float* mDeviceBdata; + //float* mDeviceCdata; + + // rocky: + double* mDeviceAdata; + double* mDeviceBdata; + double* mDeviceCdata; - float* mDeviceAdata; - float* mDeviceBdata; - float* mDeviceCdata; + int* d_faultyElemData; + int err_num; bool mRunKernel; int mNumIterations; diff --git a/gpu-burn/Makefile b/gpu-burn/Makefile index 5f81edc..39324e5 100644 --- a/gpu-burn/Makefile +++ b/gpu-burn/Makefile @@ -1,12 +1,12 @@ HIP_PATH ?= /opt/rocm/hip HCC_PATH ?= /opt/rocm/hcc HIP_PLATFORM = $(shell $(HIP_PATH)/bin/hipconfig --platform) -HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include +HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include -I/opt/rocm/include BUILD_DIR ?= build HIPCC = ${HIP_PATH}/bin/hipcc -CPPFLAGS = -O3 -LDFLAGS = -lm -lpthread +CPPFLAGS = -lrocblas -L/opt/rocm/lib -I/opt/rocm/include +LDFLAGS = -lm -lrocblas ifeq (${HIP_PLATFORM}, nvcc) CPPFLAGS += -arch=compute_20 diff --git a/gpu-burn/gpuburn.cpp b/gpu-burn/gpuburn.cpp index 2a113b0..6fe15ca 100644 --- a/gpu-burn/gpuburn.cpp +++ b/gpu-burn/gpuburn.cpp @@ -26,11 +26,11 @@ std::vector> genBurnKernels() try { checkError(hipGetDeviceCount(&deviceCount)); - std::cout<<"Total no. of GPUs found: "< kernel(new BurnKernel(i)); @@ -89,10 +89,22 @@ int doBurn(int burnSec) { std::ostringstream msg; msg << "Temps: "; for (auto& monitor : gpuMonitors) { - msg << "[GPU" << monitor->getId() << ": " << monitor->getTemperature() << " C] "; + msg << "[GPU" << monitor->getId() << ":" << monitor->getTemperature() << "C] "; + } + + int cnt = 0; + msg << " Accuracy: "; + int current_err = 0; + for(auto& kernel : burnKernels){ + current_err += kernel->get_err_num(); + msg << "[GPU " << kernel->mHipDevice << " err: " << kernel->get_err_num() << "] " ; + cnt += 1; } + + msg << burnSec << "s\n"; std::cout << msg.str(); + sleep(1); } @@ -100,6 +112,31 @@ int doBurn(int burnSec) { kernel->stopBurn(); } + + // final report, rockyli: + // Tested 2 GPUs: + // GPU 0: FAULTY + // GPU 1: OK + + std::ostringstream rpt; + int gpu_count = 0; + for (auto& kernel : burnKernels) { + gpu_count +=1; + } + rpt << "Tested " << gpu_count << " GPUs:\n"; + for (auto& kernel : burnKernels) { + std::string r; + if(kernel->get_err_num() > 0){ + r = "FAULTY"; + } + else{ + r = "OK"; + } + rpt << "\tGPU " << kernel->mHipDevice << ": " << r << "\n"; + } + std::cout << rpt.str(); + + return 0; } @@ -118,7 +155,6 @@ int main(int argc, char **argv) { std::cerr << "Usage: " << argv[0] << " [-t sec]\n"; return -EINVAL; } - return doBurn(burnSec); }