Skip to content

Commit f32282c

Browse files
committed
Fixed for sm_60
1 parent 4ef2fcc commit f32282c

File tree

2 files changed

+18
-17
lines changed

2 files changed

+18
-17
lines changed

posts/mixed-precision/Makefile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2424
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26-
CUDA_ARCH_FLAGS :=
26+
CUDA_ARCH_FLAGS := -arch=sm_60
2727
CC_FLAGS += --std=c++11 $(CUDA_ARCH_FLAGS)
2828

2929
EXE = haxpy
@@ -34,4 +34,4 @@ all: $(EXE)
3434
nvcc $< $(CC_FLAGS) $(LIB_FLAGS) -o $@
3535

3636
clean:
37-
rm -f $(EXE)
37+
rm -f $(EXE)

posts/mixed-precision/haxpy.cu

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -53,20 +53,21 @@ void haxpy(int n, half a, const half *x, half *y)
5353
int stride = blockDim.x * gridDim.x;
5454

5555
#if __CUDA_ARCH__ >= 530
56-
int n2 = n/2;
56+
int n2 = n/2;
5757
half2 *x2 = (half2*)x, *y2 = (half2*)y;
5858

59-
for (int i = start; i < n2; i+= stride)
59+
for (int i = start; i < n2; i+= stride)
6060
y2[i] = __hfma2(__halves2half2(a, a), x2[i], y2[i]);
6161

6262
// first thread handles singleton for odd arrays
63-
if (start == 0 && (n%2))
64-
y[n-1] = __hfma(a, x[n-1], y[n-1]);
63+
if (start == 0 && (n%2))
64+
y[n-1] = __hfma(a, x[n-1], y[n-1]);
65+
6566
#else
66-
for (int i = start; i < n; i+= stride) {
67-
y[i] = __float2half(__half2float(a) * __half2float(x[i])
67+
for (int i = start; i < n; i+= stride) {
68+
y[i] = __float2half(__half2float(a) * __half2float(x[i])
6869
+ __half2float(y[i]));
69-
}
70+
}
7071
#endif
7172
}
7273

@@ -84,17 +85,17 @@ int main(void) {
8485
y[i] = approx_float_to_half((float)i);
8586
}
8687

87-
const int blockDim = 256;
88-
const int nBlocks = (n + blockDim - 1) / blockDim;
88+
const int blockSize = 256;
89+
const int nBlocks = (n + blockSize - 1) / blockSize;
8990

90-
haxpy<<<blockDim, nBlocks>>>(n, a, x, y);
91+
haxpy<<<nBlocks, blockSize>>>(n, a, x, y);
9192

92-
checkCuda(cudaDeviceSynchronize());
93+
// must wait for kernel to finish before CPU accesses
94+
checkCuda(cudaDeviceSynchronize());
9395

94-
for (int i = 0; i < n; i++)
95-
printf("%f\n", half_to_float(y[i]));
96+
for (int i = 0; i < n; i++)
97+
printf("%f\n", half_to_float(y[i]));
9698

97-
98-
return 0;
99+
return 0;
99100
}
100101

0 commit comments

Comments
 (0)