-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcudaBlasExample.c
More file actions
129 lines (101 loc) · 3.38 KB
/
cudaBlasExample.c
File metadata and controls
129 lines (101 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
// compile as:
// export PATH=$PATH:/usr/local/cuda/bin
// nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample
/*
 * Returns the current time of day, as reported by gettimeofday(), expressed
 * in seconds as a double (whole seconds plus the microsecond field scaled by
 * 1e-6). Callers subtract two readings to obtain an elapsed interval.
 */
double read_timer() {
    struct timeval now;
    double wholeSeconds;
    double microSeconds;

    gettimeofday( &now, NULL );

    wholeSeconds = now.tv_sec;
    microSeconds = now.tv_usec;
    return wholeSeconds + 1.e-6 * microSeconds;
}
/*
 * Fills the first n entries of p with pseudo-random doubles.
 *
 * The drand48() generator is reseeded with 0 on every call, so two calls with
 * the same n produce the same sequence. Each drand48() result is mapped with
 * 2*x - 1, i.e. from drand48()'s range onto a range centred on zero.
 *
 *   p  destination buffer, must hold at least n doubles
 *   n  number of entries to write; nothing is written when n <= 0
 */
void fillMatrix( double *p, int n ) {
    int idx = 0;

    srand48(0);
    while( idx < n ) {
        p[idx] = 2.0 * drand48() - 1.0;
        ++idx;
    }
}
/*
 * Runs one timed pass for a size x size matrix of doubles:
 *   1. allocates two host buffers (A, B) and two device buffers (dA, dB),
 *   2. fills B on the host with fillMatrix(),
 *   3. copies B to dB with cublasSetMatrix(),
 *   4. computes dA = dB * dB with cublasDgemm(),
 *   5. copies dA back into A with cublasGetMatrix(),
 *   6. prints the elapsed time of the allocation, upload, multiply and
 *      download phases.
 *
 * cudaDeviceSynchronize() is called before every timer reading so that each
 * interval only covers work that has actually completed on the device.
 *
 *   handle  an already-created cuBLAS handle; it is not destroyed here
 *   size    matrix dimension (rows == columns == leading dimension)
 *
 * Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
 * Every buffer allocated here is released on all paths, including failures.
 */
static int runBenchmark( cublasHandle_t handle, int size ) {
    /* All locals are declared before the first goto so the jump to the
       cleanup label never skips an initialization. */
    const double one = 1., zero = 0.;
    const size_t bytes = sizeof(double)*size*size;
    int result = EXIT_FAILURE;
    double *A = NULL, *B = NULL;
    double *dA = NULL, *dB = NULL;
    double tInit, tAlloc, tInit2, tTransferToGPU, tMatMult, tTransferFromGPU;
    cudaError_t cudaStat;
    cublasStatus_t stat;

    // allocate memory on host (CPU)
    A = (double*) malloc( bytes );
    B = (double*) malloc( bytes );
    if( A == NULL || B == NULL ) {
        printf ("host memory allocation failed\n");
        goto cleanup;
    }

    cudaDeviceSynchronize();
    tInit = read_timer();

    // allocate memory on device (GPU)
    cudaStat = cudaMalloc((void**)&dA, bytes);
    if(cudaStat != cudaSuccess) {
        dA = NULL;  /* make sure cleanup does not free an undefined pointer */
        printf ("device memory allocation failed");
        goto cleanup;
    }
    cudaStat = cudaMalloc((void**)&dB, bytes);
    if(cudaStat != cudaSuccess) {
        dB = NULL;
        printf ("device memory allocation failed");
        goto cleanup;
    }

    // wait until previous CUDA commands on GPU threads have finished
    // this allows us to do the timing correctly
    cudaDeviceSynchronize();
    tAlloc = read_timer();

    // create our test matrix on the CPU
    fillMatrix(B, size*size);
    cudaDeviceSynchronize();
    tInit2 = read_timer();

    // copy matrix to GPU, with dB the pointer to the object on the GPU
    stat = cublasSetMatrix (size, size, sizeof(double), B, size, dB, size);
    if(stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        goto cleanup;
    }
    cudaDeviceSynchronize();
    tTransferToGPU = read_timer();

    // call cublas matrix multiply (dA = dB * dB)
    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size,
                       &one, dB, size, dB, size, &zero, dA, size );
    // the synchronize both closes the timed interval and reports any error
    // raised while the multiply was executing
    cudaStat = cudaDeviceSynchronize();
    tMatMult = read_timer();
    if(stat != CUBLAS_STATUS_SUCCESS || cudaStat != cudaSuccess) {
        printf ("matrix multiply failed\n");
        goto cleanup;
    }

    // transfer matrix back to CPU
    stat = cublasGetMatrix (size, size, sizeof(double), dA, size, A, size);
    if(stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        goto cleanup;
    }
    cudaDeviceSynchronize();
    tTransferFromGPU = read_timer();

    printf("====================================================\n");
    printf("Timing results for n = %d\n", size);
    printf("GPU memory allocation time: %f\n", tAlloc - tInit);
    printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2);
    printf("Matrix multiply time: %f\n", tMatMult - tTransferToGPU);
    printf("Transfer from GPU time: %f\n", tTransferFromGPU - tMatMult);
    result = EXIT_SUCCESS;

cleanup:
    // free memory on GPU and CPU; cudaFree and free both accept NULL
    cudaFree(dA);
    cudaFree(dB);
    free(A);
    free(B);
    return result;
}

/*
 * Benchmarks cuBLAS double-precision matrix multiplication for square
 * matrices of dimension 256, 512, ..., 8192 and prints per-phase timings for
 * each size.
 *
 * A single cuBLAS handle is created up front and shared by all sizes, so the
 * handle setup cost is paid once rather than once per matrix size. The run
 * stops at the first size that fails.
 *
 * Returns EXIT_SUCCESS if every size completed, EXIT_FAILURE otherwise.
 */
int main( int argc, char **argv ) {
    cublasHandle_t handle;
    int size;
    int result = EXIT_SUCCESS;

    (void)argc;  /* command-line arguments are not used */
    (void)argv;

    printf("Starting\n");

    // initialization of CUBLAS
    if(cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    for( size = 256; size <= 8192; size*=2 ) {
        result = runBenchmark( handle, size );
        if( result != EXIT_SUCCESS )
            break;
    }

    cublasDestroy(handle);
    return result;
}