New fp16 HAXPY example

harrism · harrism · commit 4ef2fcc1871f · 2016-10-19T15:12:55.000+11:00
diff --git a/posts/mixed-precision/.gitignore b/posts/mixed-precision/.gitignore
@@ -0,0 +1 @@
+haxpy
diff --git a/posts/mixed-precision/Makefile b/posts/mixed-precision/Makefile
@@ -0,0 +1,37 @@
+# Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Extra nvcc architecture flags. Empty by default; override on the command
+# line, e.g. `make CUDA_ARCH_FLAGS=-arch=sm_60`. haxpy.cu only uses native
+# FP16 arithmetic when compiled for compute capability 5.3 or higher and
+# otherwise falls back to computing in FP32.
+CUDA_ARCH_FLAGS :=
+CC_FLAGS += --std=c++11 $(CUDA_ARCH_FLAGS)
+
+EXE = haxpy
+
+# `all` and `clean` name actions, not files.
+.PHONY: all clean
+
+all: $(EXE)
+
+# Rebuild when the shared conversion header changes. This rule has no recipe,
+# so the pattern rule below still supplies the command and `$<` still refers
+# to the .cu file.
+$(EXE): fp16_conversion.h
+
+# Build an executable from the .cu file of the same name.
+% : %.cu
+	nvcc $< $(CC_FLAGS) $(LIB_FLAGS) -o $@
+
+clean:
+	rm -f $(EXE)
diff --git a/posts/mixed-precision/fp16_conversion.h b/posts/mixed-precision/fp16_conversion.h
@@ -0,0 +1,114 @@
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This code is modified from the public domain code here:
+// https://gist.github.com/rygorous/2156668
+// The URL above includes more robust conversion routines
+// that handle Inf and NaN correctly.
+//
+// It is recommended to use the more robust versions in production code.
+
+// Shorthand for unsigned int; used below for raw bit patterns.
+typedef unsigned uint;
+
+// Overlays a float with an unsigned integer and a set of bit-fields so the
+// bits of the float can be read and modified directly.
+// Brace initialization (e.g. `FP32 v = { bits };`) sets the first member, u.
+// NOTE(review): assumes unsigned is 32 bits wide and that bit-fields are
+// allocated starting from the least significant bit; both are
+// implementation-defined, as is the anonymous struct -- confirm for the
+// compilers this is built with.
+union FP32
+{
+    uint u;     // raw bit pattern
+    float f;    // the same storage viewed as a float
+    struct
+    {
+        uint Mantissa : 23;  // 23-bit fraction field
+        uint Exponent : 8;   // 8-bit exponent field
+        uint Sign : 1;       // 1-bit sign field
+    };
+};
+
+// Overlays a 16-bit unsigned value with bit-fields sized 10/5/1, used as the
+// raw representation of a half-precision value by the conversion routines
+// in this header.
+// Brace initialization (e.g. `FP16 v = { 0 };`) sets the first member, u.
+// NOTE(review): the bit-fields are declared with type uint, so whether the
+// anonymous struct occupies exactly the same 16 bits as u is
+// implementation-defined -- the routines in this header only access u.
+union FP16
+{
+    unsigned short u;   // raw bit pattern
+    struct
+    {
+        uint Mantissa : 10;  // 10-bit fraction field
+        uint Exponent : 5;   // 5-bit exponent field
+        uint Sign : 1;       // 1-bit sign field
+    };
+};
+
+// Approximate float -> half conversion. This is faster than a fully correct
+// conversion but converts some sNaNs to infinity and doesn't round correctly
+// (the low 13 mantissa bits are simply dropped). Handle with care.
+//
+//   fl       value to convert
+//   returns  a half holding the converted bit pattern; finite magnitudes
+//            above 2^16 are clamped and come out as infinity
+static half approx_float_to_half(float fl)
+{
+    FP32 f32infty = { 255 << 23 };          // +Inf as a float bit pattern
+    FP32 f16max = { (127 + 16) << 23 };     // 2^16, clamp for finite inputs
+    FP32 magic = { 15 << 23 };              // 2^-112, rebias 127 -> 15
+    FP32 expinf = { (255 ^ 31) << 23 };     // maps exponent field 255 to 31
+    uint sign_mask = 0x80000000u;
+
+    // Write through the union member instead of casting the float's address.
+    FP32 f;
+    f.f = fl;
+
+    // Work on the magnitude; the sign is re-attached at the end.
+    uint sign = f.u & sign_mask;
+    f.u ^= sign;
+
+    // Compare against the float infinity (not its integer bit pattern):
+    // the test is false only for Inf (equal) and NaN (unordered).
+    if (!(f.f < f32infty.f)) // Inf or NaN
+        f.u ^= expinf.u;     // all-ones float exponent -> all-ones half exponent
+    else
+    {
+        if (f.f > f16max.f) f.f = f16max.f;
+        f.f *= magic.f;      // moves the exponent into the half range
+    }
+
+    FP16 o = { 0 };
+    o.u = f.u >> 13; // Take the exponent and top mantissa bits
+    o.u |= sign >> 16;
+    return *((half*)&o);
+}
+
+// Half -> float conversion, used here only to verify results on the host.
+//
+//   hf       half value to widen
+//   returns  the float produced from hf's bit pattern
+static float half_to_float(half hf)
+{
+    const FP16 src = *((FP16*)&hf);
+
+    const uint expMask = 0x7c00 << 13;  // half exponent bits, after the shift
+    FP32 renorm = { 113 << 23 };        // float with exponent field 113 (2^-14)
+
+    // Move exponent and mantissa into their float positions (sign dropped).
+    FP32 result;
+    result.u = (src.u & 0x7fff) << 13;
+    const uint expBits = result.u & expMask;
+
+    // Rebias the exponent from the half bias (15) to the float bias (127).
+    result.u += (127 - 15) << 23;
+
+    if (expBits == expMask)
+    {
+        // Inf/NaN: push the exponent field the rest of the way to all ones.
+        result.u += (128 - 16) << 23;
+    }
+    else if (expBits == 0)
+    {
+        // Zero/denormal: bump the exponent once more, then subtract the
+        // matching power of two so the float arithmetic renormalizes it.
+        result.u += 1 << 23;
+        result.f -= renorm.f;
+    }
+
+    // Re-attach the sign bit.
+    result.u |= (src.u & 0x8000) << 16;
+    return result.f;
+}
diff --git a/posts/mixed-precision/haxpy.cu b/posts/mixed-precision/haxpy.cu
@@ -0,0 +1,100 @@
+// Copyright (c) 1993-2016, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include <cstdio>
+#include <cuda_fp16.h>
+#include <assert.h>
+#include "fp16_conversion.h"
+
+// This is a simple example of using FP16 types and arithmetic on
+// GPUs that support it. The code computes an AXPY (A * X + Y) operation
+// on half-precision (FP16) vectors (HAXPY).
+
+// Pass-through check for CUDA runtime API results: wrap any runtime call in
+// it. When DEBUG or _DEBUG is defined, a failing result is reported on
+// stderr and then asserted on; otherwise nothing is checked.
+// Always returns the result it was given.
+inline
+cudaError_t checkCuda(cudaError_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+  if (result == cudaSuccess)
+    return result;
+
+  const char *message = cudaGetErrorString(result);
+  fprintf(stderr, "CUDA Runtime Error: %s\n", message);
+  assert(result == cudaSuccess);
+#endif
+  return result;
+}
+
+// HAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
+// Uses a grid-stride loop, so it is valid for any 1D launch configuration.
+// When compiled for compute capability 5.3 or higher, elements are processed
+// two at a time as half2 with FP16 fused multiply-add; otherwise each element
+// is converted to float, computed, and converted back.
+//
+//   n  number of elements in x and y
+//   a  scale factor
+//   x  input vector (read only)
+//   y  input/output vector, updated in place
+__global__
+void haxpy(int n, half a, const half *x, half *y)
+{
+    const int first = blockIdx.x * blockDim.x + threadIdx.x;
+    const int step = gridDim.x * blockDim.x;
+
+#if __CUDA_ARCH__ >= 530
+    // View both vectors as packed pairs of halves.
+    const half2 *xPairs = reinterpret_cast<const half2*>(x);
+    half2 *yPairs = reinterpret_cast<half2*>(y);
+    const int pairCount = n / 2;
+
+    // Broadcast a into both lanes once, outside the loop.
+    const half2 aPair = __halves2half2(a, a);
+
+    for (int p = first; p < pairCount; p += step) {
+        yPairs[p] = __hfma2(aPair, xPairs[p], yPairs[p]);
+    }
+
+    // An odd n leaves one trailing element; the first thread handles it.
+    const bool oddLength = (n % 2) != 0;
+    if (oddLength && first == 0) {
+        const int last = n - 1;
+        y[last] = __hfma(a, x[last], y[last]);
+    }
+#else
+    // No native FP16 arithmetic: do the multiply-add in float.
+    const float scale = __half2float(a);
+    for (int i = first; i < n; i += step) {
+        const float xi = __half2float(x[i]);
+        const float yi = __half2float(y[i]);
+        y[i] = __float2half(scale * xi + yi);
+    }
+#endif
+}
+
+// Host driver: fills x with 1 and y with 0..n-1, runs haxpy with a = 2 on
+// the GPU, and prints the resulting y values (2 + i for element i).
+// Returns 0 on success and 1 if allocation, launch, or execution fails.
+int main(void) {
+    const int n = 100;
+
+    const half a = approx_float_to_half(2.0f);
+
+    // Managed memory: written by the host loop below, read and updated by
+    // the kernel, then read back by the host.
+    // checkCuda only reports in debug builds, so test its return value too.
+    half *x = nullptr, *y = nullptr;
+    if (checkCuda(cudaMallocManaged(&x, n * sizeof(half))) != cudaSuccess ||
+        checkCuda(cudaMallocManaged(&y, n * sizeof(half))) != cudaSuccess) {
+        fprintf(stderr, "Failed to allocate managed memory\n");
+        cudaFree(x);  // no-op if x is still null
+        return 1;
+    }
+
+    for (int i = 0; i < n; i++) {
+        x[i] = approx_float_to_half(1.0f);
+        y[i] = approx_float_to_half((float)i);
+    }
+
+    const int threadsPerBlock = 256;
+    const int nBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+
+    // The execution configuration is <<<number of blocks, threads per block>>>.
+    haxpy<<<nBlocks, threadsPerBlock>>>(n, a, x, y);
+
+    // A kernel launch returns nothing: launch errors are read back with
+    // cudaGetLastError(), execution errors surface at the synchronization.
+    cudaError_t status = checkCuda(cudaGetLastError());
+    if (status == cudaSuccess)
+        status = checkCuda(cudaDeviceSynchronize());
+
+    if (status == cudaSuccess) {
+        for (int i = 0; i < n; i++)
+            printf("%f\n", half_to_float(y[i]));
+    } else {
+        fprintf(stderr, "haxpy failed: %s\n", cudaGetErrorString(status));
+    }
+
+    cudaFree(x);
+    cudaFree(y);
+
+    return status == cudaSuccess ? 0 : 1;
+}
+