diff --git a/gpu-burn/BurnKernel.cpp b/gpu-burn/BurnKernel.cpp
index bd538a4..1e9b10e 100644
--- a/gpu-burn/BurnKernel.cpp
+++ b/gpu-burn/BurnKernel.cpp
@@ -11,6 +11,9 @@
 #include "common.h"
 #include "BurnKernel.h"
 
+#include "rocblas.h"
+
+#define EPSILOND 0.0000001f
 // ---------------------------------------------------------------------------
 namespace gpuburn {
 
@@ -26,48 +29,55 @@ BurnKernel::BurnKernel(int hipDevice)
     : mHipDevice(hipDevice), mRunKernel(false),
     mDeviceAdata(NULL), mDeviceBdata(NULL), mDeviceCdata(NULL)
 {
+
+    err_num = 0;
+
 }
 
 BurnKernel::~BurnKernel()
 {
-    if (mBurnThread)
+    if (mBurnThread){
         mBurnThread->join();
+    }
 
-    if (mDeviceAdata)
+    if (mDeviceAdata){
         hipFree(mDeviceAdata);
+    }
 
-    if (mDeviceBdata)
+    if (mDeviceBdata){
         hipFree(mDeviceBdata);
+    }
 
-    if (mDeviceCdata)
+    if (mDeviceCdata){
         hipFree(mDeviceCdata);
+    }
 }
 
 // ---------------------------------------------------------------------------
 
-extern "C" __global__ void hip_sgemm_kernel(hipLaunchParm lp, const int M,
-                                            const int N, const int K,
-                                            const float alpha,
-                                            float *A, const int lda, float *B,
-                                            const int ldb, const float beta,
-                                            float *C, const int ldc)
+
+
+extern "C" __global__ void hip_compare_kernel(double *C, int *faultyElems, size_t iters) 
 {
         //column major NN
-        size_t idx_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-        size_t idx_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
-        size_t dim_x = hipGridDim_x * hipBlockDim_x;
+        size_t idx_x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t idx_y = blockIdx.y * blockDim.y + threadIdx.y;
+        size_t dim_x = gridDim.x * blockDim.x;
+
         size_t myIdx = idx_y * dim_x + idx_x;
 
-        float local_c = beta * C[myIdx];
 
-        for(int k = 0; k < K; k++) {
-          local_c += alpha * A[ idx_y + k * K] * B[ idx_x * K + k];
-        }
+	size_t iterStep = hipBlockDim_x*hipBlockDim_y*hipGridDim_x*hipGridDim_y;
 
-        C[myIdx] = local_c;
+        int myFaulty = 0;
+	for (size_t i = 1; i < iters; ++i){
+	        if(fabs(C[myIdx] - C[myIdx + iterStep]) > EPSILOND){
+			myFaulty++;
+		}
+        }
+	atomicAdd(faultyElems, myFaulty);
 }
 
-// ---------------------------------------------------------------------------
 
 int BurnKernel::Init()
 {
@@ -82,14 +92,19 @@ int BurnKernel::Init()
         mHostBdata[i] = (rand() % 1000000)/100000.0;
     }
 
+
     size_t freeMem = getAvailableMemory() * cUseMem;
-    size_t matrixSizeBytes = sizeof(float)*cMatrixSize;
+    //size_t matrixSizeBytes = sizeof(float)*cMatrixSize;
+    size_t matrixSizeBytes = sizeof(double)*cMatrixSize;
     mNumIterations = (freeMem - (matrixSizeBytes*2))/matrixSizeBytes;
 
     checkError(hipMalloc((void**)&mDeviceAdata, matrixSizeBytes), "Alloc A");
     checkError(hipMalloc((void**)&mDeviceBdata, matrixSizeBytes), "Alloc B");
     checkError(hipMalloc((void**)&mDeviceCdata, matrixSizeBytes*mNumIterations), "Alloc C");
 
+    //rocky added for acc check:
+    checkError(hipMalloc(&d_faultyElemData, sizeof(int)), "faulty data");
+
     checkError(hipMemcpy(mDeviceAdata, mHostAdata, matrixSizeBytes, hipMemcpyHostToDevice), "A -> device");
     checkError(hipMemcpy(mDeviceBdata, mHostBdata, matrixSizeBytes, hipMemcpyHostToDevice), "B -> device");
     checkError(hipMemset(mDeviceCdata, 0, matrixSizeBytes*mNumIterations), "C memset");
@@ -142,25 +157,37 @@ int BurnKernel::runComputeKernel()
 {
     int err = 0;
 
+
     for (int i = 0; mRunKernel && i < mNumIterations; ++i) {
-        hipLaunchKernel(
-            /* Launch params */
-            HIP_KERNEL_NAME(hip_sgemm_kernel),
-            dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),
-            dim3(cBlockSize,cBlockSize,1), 0, 0,
-            /* Kernel params */
-            cRowSize, cRowSize, cRowSize, cAlpha,
-            mDeviceAdata, cRowSize,
-            mDeviceBdata, cRowSize,
-            cBeta,
-            mDeviceCdata + i*cMatrixSize,
-            cRowSize);
+
+	double alpha = 1.1;
+	double beta = 0.0;
+
+    	rocblas_handle handle;
+  	rocblas_create_handle(&handle);
+        rocblas_dgemm(handle, rocblas_operation_none, rocblas_operation_transpose, cRowSize, cRowSize, cRowSize, &alpha, mDeviceAdata, cRowSize, mDeviceBdata,cRowSize , &beta, mDeviceCdata + i*cMatrixSize, cRowSize);
     }
+
+    checkError(hipDeviceSynchronize(), "Sync"); // rocky added to fix seg fault
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(hip_compare_kernel),dim3(cRowSize/cBlockSize, cRowSize/cBlockSize, 1),dim3(cBlockSize,cBlockSize,1), 0, 0, mDeviceCdata, d_faultyElemData, mNumIterations);
+
+
+    int *d_faultyElemsHost;
+    checkError(hipMemcpy(d_faultyElemsHost, d_faultyElemData, sizeof(int), hipMemcpyDeviceToHost), "Read faultyelemdata");
+
+    err_num += *d_faultyElemsHost;
+
     checkError(hipDeviceSynchronize(), "Sync");
 
     return err;
 }
 
+int BurnKernel::get_err_num(){
+    return err_num;
+}
+
+
 size_t BurnKernel::getAvailableMemory()
 {
     size_t freeMem, totalMem;
diff --git a/gpu-burn/BurnKernel.h b/gpu-burn/BurnKernel.h
index d8f9b15..52b95e4 100644
--- a/gpu-burn/BurnKernel.h
+++ b/gpu-burn/BurnKernel.h
@@ -8,6 +8,7 @@
 #define GPUBURN_BURNKERNEL_H_
 
 #include <thread>
+#include <map>
 
 // ---------------------------------------------------------------------------
 namespace gpuburn {
@@ -34,21 +35,35 @@ class BurnKernel {
          */
         int stopBurn();
 
+	// rocky:
+	int get_err_num();
+
     private:
         static constexpr int cRandSeed = 10;
         static constexpr float cUseMem = 0.80;
-        static constexpr uint32_t cRowSize = 512;
+        //static constexpr uint32_t cRowSize = 512;
+        static constexpr uint32_t cRowSize = 8640; // rocky, 20190809
         static constexpr uint32_t cMatrixSize = cRowSize * cRowSize;
         static constexpr uint32_t cBlockSize = 16;
         static constexpr float cAlpha = 1.0f;
         static constexpr float cBeta = 0.0f;
 
-        float mHostAdata[cMatrixSize];
-        float mHostBdata[cMatrixSize];
+        //float mHostAdata[cMatrixSize];
+        //float mHostBdata[cMatrixSize];
+
+        double mHostAdata[cMatrixSize];
+        double mHostBdata[cMatrixSize];
+        //float* mDeviceAdata;
+        //float* mDeviceBdata;
+        //float* mDeviceCdata;
+
+	// rocky:
+        double* mDeviceAdata;
+        double* mDeviceBdata;
+        double* mDeviceCdata;
 
-        float* mDeviceAdata;
-        float* mDeviceBdata;
-        float* mDeviceCdata;
+	int* d_faultyElemData;
+	int err_num;
 
         bool mRunKernel;
         int mNumIterations;
diff --git a/gpu-burn/Makefile b/gpu-burn/Makefile
index 5f81edc..39324e5 100644
--- a/gpu-burn/Makefile
+++ b/gpu-burn/Makefile
@@ -1,12 +1,12 @@
 HIP_PATH ?= /opt/rocm/hip
 HCC_PATH ?= /opt/rocm/hcc
 HIP_PLATFORM = $(shell $(HIP_PATH)/bin/hipconfig --platform)
-HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include
+HIP_INCLUDE = -I${HIP_PATH}/include -I${HCC_PATH}/include -I/opt/rocm/include
 BUILD_DIR ?= build
 
 HIPCC = ${HIP_PATH}/bin/hipcc
-CPPFLAGS = -O3
-LDFLAGS = -lm -lpthread
+CPPFLAGS = -lrocblas -L/opt/rocm/lib -I/opt/rocm/include
+LDFLAGS = -lm -lrocblas
 
 ifeq (${HIP_PLATFORM}, nvcc)
     CPPFLAGS += -arch=compute_20
diff --git a/gpu-burn/gpuburn.cpp b/gpu-burn/gpuburn.cpp
index 2a113b0..6fe15ca 100644
--- a/gpu-burn/gpuburn.cpp
+++ b/gpu-burn/gpuburn.cpp
@@ -26,11 +26,11 @@ std::vector<std::unique_ptr<BurnKernel>> genBurnKernels()
 
     try {
         checkError(hipGetDeviceCount(&deviceCount));
-        std::cout<<"Total no. of GPUs found: "<<deviceCount<<std::endl;
     } catch (std::string e) {
         std::cerr << "Error: couldn't find any HIP devices\n";
     }
 
+
     for (int i =0; i < deviceCount; ++i) {
         try {
             std::unique_ptr<BurnKernel> kernel(new BurnKernel(i));
@@ -89,10 +89,22 @@ int doBurn(int burnSec) {
         std::ostringstream msg;
         msg << "Temps: ";
         for (auto& monitor : gpuMonitors) {
-            msg << "[GPU" << monitor->getId() << ": " << monitor->getTemperature() << " C] ";
+            msg << "[GPU" << monitor->getId() << ":" << monitor->getTemperature() << "C] ";
+        }
+
+	int cnt = 0;
+        msg << " Accuracy: ";
+        int current_err = 0;            
+        for(auto& kernel : burnKernels){
+            current_err += kernel->get_err_num();
+	    msg << "[GPU " << kernel->mHipDevice << " err: " << kernel->get_err_num() << "] " ;
+	    cnt += 1;
         }
+
+
         msg << burnSec << "s\n";
         std::cout << msg.str();
+
         sleep(1);
     }
 
@@ -100,6 +112,31 @@ int doBurn(int burnSec) {
         kernel->stopBurn();
     }
 
+
+    // final report, rockyli:
+    // Tested 2 GPUs:
+    // 		GPU 0: FAULTY
+    // 		GPU 1: OK
+
+    std::ostringstream rpt;
+    int gpu_count = 0;
+    for (auto& kernel : burnKernels) {
+      gpu_count +=1;
+    }
+    rpt << "Tested " << gpu_count << " GPUs:\n";
+    for (auto& kernel : burnKernels) {
+      std::string r;
+      if(kernel->get_err_num() > 0){
+        r = "FAULTY";
+      }
+      else{
+        r = "OK";
+      }
+      rpt << "\tGPU " << kernel->mHipDevice << ": " << r << "\n";
+    }
+    std::cout << rpt.str();
+
+
     return 0;
 }
 
@@ -118,7 +155,6 @@ int main(int argc, char **argv) {
                 std::cerr << "Usage: " << argv[0] << " [-t sec]\n";
                 return -EINVAL;
         }
-
     return doBurn(burnSec);
 }