@@ -53,20 +53,21 @@ void haxpy(int n, half a, const half *x, half *y)
5353 int stride = blockDim .x * gridDim .x ;
5454
5555#if __CUDA_ARCH__ >= 530
56- int n2 = n/2 ;
56+ int n2 = n/2 ;
5757 half2 *x2 = (half2*)x, *y2 = (half2*)y;
5858
59- for (int i = start; i < n2; i+= stride)
59+ for (int i = start; i < n2; i+= stride)
6060 y2[i] = __hfma2 (__halves2half2 (a, a), x2[i], y2[i]);
6161
6262 // first thread handles singleton for odd arrays
63- if (start == 0 && (n%2 ))
64- y[n-1 ] = __hfma (a, x[n-1 ], y[n-1 ]);
63+ if (start == 0 && (n%2 ))
64+ y[n-1 ] = __hfma (a, x[n-1 ], y[n-1 ]);
65+
6566#else
66- for (int i = start; i < n; i+= stride) {
67- y[i] = __float2half (__half2float (a) * __half2float (x[i])
67+ for (int i = start; i < n; i+= stride) {
68+ y[i] = __float2half (__half2float (a) * __half2float (x[i])
6869 + __half2float (y[i]));
69- }
70+ }
7071#endif
7172}
7273
@@ -84,17 +85,17 @@ int main(void) {
8485 y[i] = approx_float_to_half ((float )i);
8586 }
8687
87- const int blockDim = 256 ;
88- const int nBlocks = (n + blockDim - 1 ) / blockDim ;
88+ const int blockSize = 256 ;
89+ const int nBlocks = (n + blockSize - 1 ) / blockSize ;
8990
90- haxpy<<<blockDim , nBlocks >>> (n, a, x, y);
91+ haxpy<<<nBlocks, blockSize >>> (n, a, x, y);
9192
92- checkCuda (cudaDeviceSynchronize ());
93+ // must wait for kernel to finish before CPU accesses
94+ checkCuda (cudaDeviceSynchronize ());
9395
94- for (int i = 0 ; i < n; i++)
95- printf (" %f\n " , half_to_float (y[i]));
96+ for (int i = 0 ; i < n; i++)
97+ printf (" %f\n " , half_to_float (y[i]));
9698
97-
98- return 0 ;
99+ return 0 ;
99100}
100101
0 commit comments