@@ -54,10 +54,10 @@ void haxpy(int n, half a, const half *x, half *y)
5454
5555#if __CUDA_ARCH__ >= 530
5656 int n2 = n/2 ;
57- half2 *x2 = (half2*)x, *y2 = (half2*)y;
57+ half2 *x2 = (half2*)x, *y2 = (half2*)y;
5858
59- for (int i = start; i < n2; i+= stride)
60- y2[i] = __hfma2 (__halves2half2 (a, a), x2[i], y2[i]);
59+ for (int i = start; i < n2; i+= stride)
60+ y2[i] = __hfma2 (__halves2half2 (a, a), x2[i], y2[i]);
6161
6262 // first thread handles singleton for odd arrays
6363 if (start == 0 && (n%2 ))
@@ -66,33 +66,33 @@ void haxpy(int n, half a, const half *x, half *y)
6666#else
6767 for (int i = start; i < n; i+= stride) {
6868 y[i] = __float2half (__half2float (a) * __half2float (x[i])
69- + __half2float (y[i]));
69+ + __half2float (y[i]));
7070 }
7171#endif
7272}
7373
7474int main (void ) {
75- const int n = 100 ;
75+ const int n = 100 ;
7676
77- const half a = approx_float_to_half (2 .0f );
77+ const half a = approx_float_to_half (2 .0f );
7878
79- half *x, *y;
80- checkCuda (cudaMallocManaged (&x, n * sizeof (half)));
81- checkCuda (cudaMallocManaged (&y, n * sizeof (half)));
82-
83- for (int i = 0 ; i < n; i++) {
84- x[i] = approx_float_to_half (1 .0f );
85- y[i] = approx_float_to_half ((float )i);
86- }
79+ half *x, *y;
80+ checkCuda (cudaMallocManaged (&x, n * sizeof (half)));
81+ checkCuda (cudaMallocManaged (&y, n * sizeof (half)));
82+
83+ for (int i = 0 ; i < n; i++) {
84+ x[i] = approx_float_to_half (1 .0f );
85+ y[i] = approx_float_to_half ((float )i);
86+ }
8787
88- const int blockSize = 256 ;
89- const int nBlocks = (n + blockSize - 1 ) / blockSize;
88+ const int blockSize = 256 ;
89+ const int nBlocks = (n + blockSize - 1 ) / blockSize;
9090
91- haxpy<<<nBlocks, blockSize>>> (n, a, x, y);
91+ haxpy<<<nBlocks, blockSize>>> (n, a, x, y);
9292
9393 // must wait for kernel to finish before CPU accesses
9494 checkCuda (cudaDeviceSynchronize ());
95-
95+
9696 for (int i = 0 ; i < n; i++)
9797 printf (" %f\n " , half_to_float (y[i]));
9898
0 commit comments