Kernel-Performance-Prediction/vecAdd.cu at main · Spielerr/Kernel-Performance-Prediction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>


// Forward declaration of the vector-add kernel defined after main().
// Arguments, in order: first input array, second input array, output array,
// and the number of elements; the kernel guards its accesses with index < n.
__global__ void addvector(int *, int *, int *, int);

// Abort with a diagnostic on stderr if a CUDA runtime call did not succeed.
// `what` names the operation so the failing step is identifiable.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Benchmark driver: fills two host arrays with random ints, copies them to the
// device, runs addvector over them, copies the result back and prints
// "<numelements>:<elapsed milliseconds>" on stdout.
//
// Command line: addvec numelements num_threads
//   numelements - number of elements per array (must be > 0)
//   num_threads - threads per block for the launch (must be > 0)
//
// The timed region covers both host-to-device copies, the kernel (including
// the synchronize) and the device-to-host copy. Any allocation, copy or launch
// failure terminates the program with exit status 1 and a message on stderr,
// so a timing line is only ever printed for a run that actually completed.
int main(int argc, char *argv[])
{
    int i;
    int num = 0;                              // number of elements in the arrays
    int *a = NULL, *b = NULL, *c = NULL;      // arrays at host
    int *ad = NULL, *bd = NULL, *cd = NULL;   // arrays at device
    int THREADS = 0;                          // user decides number of threads per block

    if (argc != 3) {
        fprintf(stderr, "usage: addvec numelements num_threads\n");
        fprintf(stderr, "num_threads: number of threads per block\n");
        exit(1);
    }

    num = atoi(argv[1]);
    THREADS = atoi(argv[2]);

    // atoi yields 0 for non-numeric text; zero or negative values would cause a
    // division by zero below or a nonsensical allocation size.
    if (num <= 0 || THREADS <= 0) {
        fprintf(stderr, "numelements and num_threads must be positive integers\n");
        exit(1);
    }

    const size_t bytes = (size_t)num * sizeof(int);

    a = (int *)malloc(bytes);
    if (!a) {
        fprintf(stderr, "Cannot allocate array a with %d elements\n", num);
        exit(1);
    }

    b = (int *)malloc(bytes);
    if (!b) {
        fprintf(stderr, "Cannot allocate array b with %d elements\n", num);
        exit(1);
    }

    // calloc hands back c already zeroed in preparation for the GPU result.
    c = (int *)calloc((size_t)num, sizeof(int));
    if (!c) {
        fprintf(stderr, "Cannot allocate array c with %d elements\n", num);
        exit(1);
    }

    // Fill out arrays a and b with some random numbers
    srand(time(0));
    for (i = 0; i < num; i++) {
        a[i] = rand() % num;
        b[i] = rand() % num;
    }

    // Ceiling division without the overflow risk of (num + THREADS - 1).
    int numblocks = num / THREADS + ((num % THREADS) != 0 ? 1 : 0);
    int threadsperblock = THREADS;

    // One-dimensional launch: numblocks blocks of threadsperblock threads.
    dim3 grid(numblocks, 1, 1);
    dim3 block(threadsperblock, 1, 1);

    checkCuda(cudaMalloc((void **)&ad, bytes), "cudaMalloc of ad");
    checkCuda(cudaMalloc((void **)&bd, bytes), "cudaMalloc of bd");
    checkCuda(cudaMalloc((void **)&cd, bytes), "cudaMalloc of cd");

    struct timeval t1, t2;
    gettimeofday(&t1, 0);

    // move a and b to the device
    checkCuda(cudaMemcpy(ad, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(bd, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    // Launch the kernel. A launch itself returns nothing, so configuration
    // errors are picked up with cudaGetLastError and execution errors at the
    // synchronize.
    addvector<<<grid, block>>>(ad, bd, cd, num);
    checkCuda(cudaGetLastError(), "addvector launch");
    checkCuda(cudaDeviceSynchronize(), "addvector execution");

    // bring data back
    checkCuda(cudaMemcpy(c, cd, bytes, cudaMemcpyDeviceToHost), "copy of c to host");
    gettimeofday(&t2, 0);

    // Elapsed wall-clock time in milliseconds.
    double elapsed_ms =
        (1000000.0 * (t2.tv_sec - t1.tv_sec) + t2.tv_usec - t1.tv_usec) / 1000.0;
    printf("%d:%.3f\n", num, elapsed_ms);

    free(a);
    free(b);
    free(c);

    checkCuda(cudaFree(ad), "cudaFree of ad");
    checkCuda(cudaFree(bd), "cudaFree of bd");
    checkCuda(cudaFree(cd), "cudaFree of cd");

    return 0;
}


// Element-wise integer vector add with a fixed busy-wait in every thread.
//
//   ad, bd - input arrays (device pointers), at least n elements each
//   cd     - output array (device pointer), at least n elements
//   n      - number of elements to process
//
// Expects a one-dimensional launch: each thread derives a single flat index
// from blockIdx.x, blockDim.x and threadIdx.x and handles that one element, so
// the grid must supply at least n threads in total. Threads whose index falls
// at or beyond n do nothing. No shared memory is used.
//
// Before writing its result, each in-range thread spins until at least
// 10,000,000 ticks of clock64() have passed since it started.
__global__  void addvector(int * ad, int * bd, int *cd, int n)
{
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;

   if (index < n) {
      // clock64() returns a 64-bit count; keep it in long long rather than the
      // host clock_t type, whose width is platform dependent. A plain
      // difference is enough: a 64-bit counter does not wrap within a kernel's
      // lifetime, and an equal reading simply counts as zero elapsed ticks.
      const long long delay_ticks = 10000000LL;
      const long long start = clock64();

      while (clock64() - start < delay_ticks) {
         // busy-wait
      }

      cd[index] = ad[index] + bd[index];
   }
}