Skip to content

Commit 4ef2fcc

Browse files
committed
New fp16 HAXPY example
1 parent 379b693 commit 4ef2fcc

File tree

4 files changed

+252
-0
lines changed

4 files changed

+252
-0
lines changed

posts/mixed-precision/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
haxpy

posts/mixed-precision/Makefile

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Redistribution and use in source and binary forms, with or without
4+
# modification, are permitted provided that the following conditions
5+
# are met:
6+
# * Redistributions of source code must retain the above copyright
7+
# notice, this list of conditions and the following disclaimer.
8+
# * Redistributions in binary form must reproduce the above copyright
9+
# notice, this list of conditions and the following disclaimer in the
10+
# documentation and/or other materials provided with the distribution.
11+
# * Neither the name of NVIDIA CORPORATION nor the names of its
12+
# contributors may be used to endorse or promote products derived
13+
# from this software without specific prior written permission.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
CUDA_ARCH_FLAGS :=
27+
CC_FLAGS += --std=c++11 $(CUDA_ARCH_FLAGS)
28+
29+
EXE = haxpy
30+
31+
all: $(EXE)
32+
33+
% : %.cu
34+
nvcc $< $(CC_FLAGS) $(LIB_FLAGS) -o $@
35+
36+
clean:
37+
rm -f $(EXE)
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
// This code modified from the public domain code here:
28+
// https://gist.github.com/rygorous/2156668
29+
// The URL above includes more robust conversion routines
30+
// that handle Inf and NaN correctly.
31+
//
32+
// It is recommended to use the more robust versions in production code.
33+
34+
typedef unsigned uint;
35+
36+
union FP32
37+
{
38+
uint u;
39+
float f;
40+
struct
41+
{
42+
uint Mantissa : 23;
43+
uint Exponent : 8;
44+
uint Sign : 1;
45+
};
46+
};
47+
48+
union FP16
49+
{
50+
unsigned short u;
51+
struct
52+
{
53+
uint Mantissa : 10;
54+
uint Exponent : 5;
55+
uint Sign : 1;
56+
};
57+
};
58+
59+
// Approximate solution. This is faster but converts some sNaNs to
60+
// infinity and doesn't round correctly. Handle with care.
61+
// Approximate solution. This is faster but converts some sNaNs to
62+
// infinity and doesn't round correctly. Handle with care.
63+
static half approx_float_to_half(float fl)
64+
{
65+
FP32 f32infty = { 255 << 23 };
66+
FP32 f16max = { (127 + 16) << 23 };
67+
FP32 magic = { 15 << 23 };
68+
FP32 expinf = { (255 ^ 31) << 23 };
69+
uint sign_mask = 0x80000000u;
70+
FP16 o = { 0 };
71+
72+
FP32 f = *((FP32*)&fl);
73+
74+
uint sign = f.u & sign_mask;
75+
f.u ^= sign;
76+
77+
if (!(f.f < f32infty.u)) // Inf or NaN
78+
o.u = f.u ^ expinf.u;
79+
else
80+
{
81+
if (f.f > f16max.f) f.f = f16max.f;
82+
f.f *= magic.f;
83+
}
84+
85+
o.u = f.u >> 13; // Take the mantissa bits
86+
o.u |= sign >> 16;
87+
return *((half*)&o);
88+
}
89+
90+
// from half->float code - just for verification.
91+
static float half_to_float(half hf)
92+
{
93+
FP16 h = *((FP16*)&hf);
94+
95+
static const FP32 magic = { 113 << 23 };
96+
static const uint shifted_exp = 0x7c00 << 13; // exponent mask after shift
97+
FP32 o;
98+
99+
o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits
100+
uint exp = shifted_exp & o.u; // just the exponent
101+
o.u += (127 - 15) << 23; // exponent adjust
102+
103+
// handle exponent special cases
104+
if (exp == shifted_exp) // Inf/NaN?
105+
o.u += (128 - 16) << 23; // extra exp adjust
106+
else if (exp == 0) // Zero/Denormal?
107+
{
108+
o.u += 1 << 23; // extra exp adjust
109+
o.f -= magic.f; // renormalize
110+
}
111+
112+
o.u |= (h.u & 0x8000) << 16; // sign bit
113+
return o.f;
114+
}

posts/mixed-precision/haxpy.cu

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
#include <cstdio>
27+
#include <cuda_fp16.h>
28+
#include <assert.h>
29+
#include "fp16_conversion.h"
30+
31+
// This is a simple example of using FP16 types and arithmetic on
32+
// GPUs that support it. The code computes an AXPY (A * X + Y) operation
33+
// on half-precision (FP16) vectors (HAXPY).
34+
35+
// Convenience function for checking CUDA runtime API results
36+
// can be wrapped around any runtime API call. No-op in release builds.
37+
inline
38+
cudaError_t checkCuda(cudaError_t result)
39+
{
40+
#if defined(DEBUG) || defined(_DEBUG)
41+
if (result != cudaSuccess) {
42+
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
43+
assert(result == cudaSuccess);
44+
}
45+
#endif
46+
return result;
47+
}
48+
49+
__global__
50+
void haxpy(int n, half a, const half *x, half *y)
51+
{
52+
int start = threadIdx.x + blockDim.x * blockIdx.x;
53+
int stride = blockDim.x * gridDim.x;
54+
55+
#if __CUDA_ARCH__ >= 530
56+
int n2 = n/2;
57+
half2 *x2 = (half2*)x, *y2 = (half2*)y;
58+
59+
for (int i = start; i < n2; i+= stride)
60+
y2[i] = __hfma2(__halves2half2(a, a), x2[i], y2[i]);
61+
62+
// first thread handles singleton for odd arrays
63+
if (start == 0 && (n%2))
64+
y[n-1] = __hfma(a, x[n-1], y[n-1]);
65+
#else
66+
for (int i = start; i < n; i+= stride) {
67+
y[i] = __float2half(__half2float(a) * __half2float(x[i])
68+
+ __half2float(y[i]));
69+
}
70+
#endif
71+
}
72+
73+
int main(void) {
74+
const int n = 100;
75+
76+
const half a = approx_float_to_half(2.0f);
77+
78+
half *x, *y;
79+
checkCuda(cudaMallocManaged(&x, n * sizeof(half)));
80+
checkCuda(cudaMallocManaged(&y, n * sizeof(half)));
81+
82+
for (int i = 0; i < n; i++) {
83+
x[i] = approx_float_to_half(1.0f);
84+
y[i] = approx_float_to_half((float)i);
85+
}
86+
87+
const int blockDim = 256;
88+
const int nBlocks = (n + blockDim - 1) / blockDim;
89+
90+
haxpy<<<blockDim, nBlocks>>>(n, a, x, y);
91+
92+
checkCuda(cudaDeviceSynchronize());
93+
94+
for (int i = 0; i < n; i++)
95+
printf("%f\n", half_to_float(y[i]));
96+
97+
98+
return 0;
99+
}
100+

0 commit comments

Comments
 (0)