From b928ca019ea061cb9ebaca2dd81450aa24f25268 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 30 Sep 2016 02:03:56 -0700 Subject: [PATCH 01/13] temporarily force pseudo-fp16 mode (HAS_HALF_INSTRUCTIONS=FALSE) for Pascal: https://github.com/torch/cutorch/issues/520 --- init.c | 4 ++++ lib/THC/THCHalf.cu | 4 ++++ lib/THC/THCHalf.h | 16 ++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/init.c b/init.c index 82f9a823..c92982cc 100644 --- a/init.c +++ b/init.c @@ -1032,6 +1032,10 @@ int luaopen_libcutorch(lua_State *L) #endif lua_setfield(L, -2, "hasHalf"); + /* true fp16 vs pseudo-fp16 mode: this one is per device */ + lua_pushboolean(L, THC_nativeHalfInstructions(state)); + lua_setfield(L, -2, "hasHalfInstructions"); + /* store gpu driver version in field */ int driverVersion; THCudaCheck(cudaDriverGetVersion(&driverVersion)); diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 7777bf76..f2484d4a 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -121,10 +121,14 @@ half THC_float2half(float a) } THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { +#if CUDA_HALF_INSTRUCTIONS cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); // CC 5.3+ return (prop->major > 5 || (prop->major == 5 && prop->minor == 3)); +#else + return false; +#endif } diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index afa83e47..b360d948 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -8,9 +8,21 @@ #define CUDA_HALF_TENSOR 1 #endif +/* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode") + even if native harware support is available. + This makes difference for Pascal (6.x) cards only: Maxwell (5.x) cards always run 'half' in pseudo mode. + !!! Uncomment on your own risk !!! + Native fp16 operations may in fact run slower than pseudo-fp16 on your system at the moment + (especially if the bulk of your code is in CUDNN and not Cutorch). +*/ + +#define FORCE_PSEUDO_FP16 1 + +#ifndef FORCE_PSEUDO_FP16 /* Kernel side: Native fp16 ALU instructions are available if we have this: */ -#if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530) -#define CUDA_HALF_INSTRUCTIONS 1 +# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530) +# define CUDA_HALF_INSTRUCTIONS 1 +# endif #endif #ifdef CUDA_HALF_TENSOR From a80d28bbc90b0aae3bd8335d3dae2a8956dcecb6 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 12 Oct 2016 17:41:43 -0700 Subject: [PATCH 02/13] checkpoint --- lib/THC/THCHalf.h | 63 +++++++++++++++++++++++++++++++++--- lib/THC/THCNumerics.cuh | 72 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 5 deletions(-) diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 76271bcc..616ec62e 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -26,18 +26,71 @@ #endif #ifdef CUDA_HALF_TENSOR - #include #include -THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); -THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); +/* CPU emulation */ +THC_EXTERNC half THC_float2half(float a); +THC_EXTERNC float THC_half2float(half a); + +#if defined (__CUDA_ARCH__) +# define THC_FLOAT_TO_HALF(x) __float2half((float)x) +# define THC_HALF_TO_FLOAT(x) __half2float((float)x) +#else +# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) +# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x) +#endif /* Check for native fp16 support on the current device (CC 5.3+) */ THC_EXTERNC int THC_nativeHalfInstructions(THCState *state); +__host__ __device__ __forceinline__ bool operator==(const half& a, const half& b) { + return a.x == b.x; +} + +__host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b) { + return a.x != b.x; +} + #endif /* CUDA_HALF_TENSOR */ +#ifdef __CUDA_ARCH__ +// +// host (CPU) routines +// +THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len); +THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. + +template +struct ScalarConvert { + static inline __host__ __device__ Out to(const In& v) { return Out(v); } +}; + +template +struct ScalarConvert { + static __host__ __device__ __forceinline__ Out to(const half& v) { + return (Out) THC_HALF_TO_FLOAT(v); + } +}; + +template +struct ScalarConvert { + static __host__ __device__ __forceinline__ half to(const In& v) { + return THC_FLOAT_TO_HALF(v); + } +}; + +template <> +struct ScalarConvert { + static __host__ __device__ __forceinline__ half to(const half& v) { + return v; + } +}; +#endif /* CUDA */ + #endif diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index a9d78978..09443605 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -26,6 +26,9 @@ struct THCNumerics { static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; } static inline __host__ __device__ unsigned char add(unsigned char a, unsigned char b) { return a + b; } + static inline __host__ __device__ unsigned char mul(unsigned char a, unsigned char b) { return a * b; } + static inline __host__ __device__ unsigned char sub(unsigned char a, unsigned char b) { return a - b; } + static inline __host__ __device__ unsigned char div(unsigned char a, unsigned char b) { return a / b; } static inline __host__ __device__ unsigned char abs(unsigned char a) { return abs(a); } }; @@ -42,6 +45,9 @@ struct THCNumerics { static inline __host__ __device__ bool ne(char a, char b) { return a != b; } static inline __host__ __device__ char add(char a, char b) { return a + b; } + static inline __host__ __device__ char mul(char a, char b) { return a * b; } + static inline __host__ __device__ char sub(char a, char b) { return a - b; } + static inline __host__ __device__ char div(char a, char b) { return a / b; } static inline __host__ __device__ char abs(char a) { return abs(a); } }; @@ -58,6 +64,9 @@ struct THCNumerics { static inline __host__ __device__ bool ne(short a, short b) { return a != b; } static inline __host__ __device__ short add(short a, short b) { return a + b; } + static inline __host__ __device__ short mul(short a, short b) { return a * b; } + static inline __host__ __device__ short sub(short a, short b) { return a - b; } + static inline __host__ __device__ short div(short a, short b) { return a / b; } static inline __host__ __device__ short abs(short a) { return abs(a); } }; @@ -74,6 +83,9 @@ struct THCNumerics { static inline __host__ __device__ bool ne(int a, int b) { return a != b; } static inline __host__ __device__ int add(int a, int b) { return a + b; } + static inline __host__ __device__ int mul(int a, int b) { return a * b; } + static inline __host__ __device__ int sub(int a, int b) { return a - b; } + static inline __host__ __device__ int div(int a, int b) { return a / b; } static inline __host__ __device__ int abs(int a) { return ::abs(a); } }; @@ -90,6 +102,9 @@ struct THCNumerics { static inline __host__ __device__ bool ne(long a, long b) { return a != b; } static inline __host__ __device__ long add(long a, long b) { return a + b; } + static inline __host__ __device__ long mul(long a, long b) { return a * b; } + static inline __host__ __device__ long sub(long a, long b) { return a - b; } + static inline __host__ __device__ long div(long a, long b) { return a / b; }; static inline __host__ __device__ long abs(long a) { return labs(a); } }; @@ -435,6 +450,55 @@ struct THCNumerics { return THC_float2half(THC_half2float(a) + THC_half2float(b)); #endif } + + static inline __host__ __device__ half div(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa / fb ); +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) / THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half mul(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hmul(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa * fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) * THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half sub(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hsub(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa - fb ); +#endif +#else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) - THC_half2float(b)); +#endif + } + + static inline __host__ __device__ half pow(half a, half b) { +#ifdef __CUDA_ARCH__ + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(powf(fa, fb)); +#else // __CUDA_ARCH__ + return THC_float2half(powf(THC_half2float(a), THC_half2float(b))); +#endif + } + }; #endif @@ -475,6 +539,10 @@ struct THCNumerics { static inline __host__ __device__ float frac (float a) { return a - truncf(a); } static inline __host__ __device__ float cinv (float a) { return 1.0f / a; } static inline __host__ __device__ float add (float a, float b) { return a + b; } + static inline __host__ __device__ float div (float a, float b) { return a / b; } + static inline __host__ __device__ float mul (float a, float b) { return a * b; } + static inline __host__ __device__ float sub (float a, float b) { return a - b; } + static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } }; template <> @@ -514,6 +582,10 @@ struct THCNumerics { static inline __host__ __device__ double frac (double a) { return a - ::trunc(a); } static inline __host__ __device__ double cinv (double a) { return 1.0 / a; } static inline __host__ __device__ double add (double a, double b) { return a + b; } + static inline __host__ __device__ double div (double a, double b) { return a / b; } + static inline __host__ __device__ double mul (double a, double b) { return a * b; } + static inline __host__ __device__ double sub (double a, double b) { return a - b; } + static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } }; /// `half` has some type conversion issues associated with it, since it From ac964ee15f1cf434bdcb674958abaecc5018e7ba Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 14 Oct 2016 01:03:47 -0700 Subject: [PATCH 03/13] checkpoint --- lib/THC/THCHalf.cu | 103 ----- lib/THC/THCHalf.h | 87 ++-- lib/THC/THCNumerics.cuh | 690 ++++++----------------------- lib/THC/THCTensorMathPointwise.cuh | 39 +- 4 files changed, 191 insertions(+), 728 deletions(-) diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 46092b73..4f317102 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -29,106 +29,3 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { #endif in, in + len, out, __half2floatOp()); } - -float THC_half2float(half a) -{ - unsigned int bits = a.x & 0x7fff; - unsigned int sign = a.x & 0x8000; - unsigned int exp = a.x & 0x7c00; - - bits <<= 13; - sign <<= 16; - - bits += 0x38000000U; - - // flush denormals to 0 - bits = (exp == 0 ? 0 : bits) | sign; - - union { - float f; - unsigned int v; - } conv; - conv.v = bits; - - return conv.f; -} - -/* - Copyright (c) 2015, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -half THC_float2half(float a) -{ - uint32_t ia; - uint16_t ir; - memcpy(&ia, &a, sizeof(float)); - - ir = (ia >> 16) & 0x8000; - if ((ia & 0x7f800000) == 0x7f800000) { - if ((ia & 0x7fffffff) == 0x7f800000) { - ir |= 0x7c00; /* infinity */ - } else { - ir = 0x7fff; /* canonical NaN */ - } - } else if ((ia & 0x7f800000) >= 0x33000000) { - int shift = (int)((ia >> 23) & 0xff) - 127; - if (shift > 15) { - ir |= 0x7c00; /* infinity */ - } else { - ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */ - if (shift < -14) { /* denormal */ - ir |= ia >> (-1 - shift); - ia = ia << (32 - (-1 - shift)); - } else { /* normal */ - ir |= ia >> (24 - 11); - ia = ia << (32 - (24 - 11)); - ir = ir + ((14 + shift) << 10); - } - /* IEEE-754 round to nearest of even */ - if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) { - ir++; - } - } - } - - half ret; - memcpy(&ret, &ir, sizeof(half)); - return ret; -} - -THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { -#if CUDA_HALF_INSTRUCTIONS - cudaDeviceProp* prop = - THCState_getCurrentDeviceProperties(state); - - // CC 5.3+ - return (prop->major > 5 || - (prop->major == 5 && prop->minor == 3)); -#else - return false; -#endif -} diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 616ec62e..11c3bb94 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -1,12 +1,12 @@ #ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC +# define THC_HALF_CONVERSION_INC -#include "THCGeneral.h" +# include "THCGeneral.h" /* We compile with CudaHalfTensor support if we have this: */ -#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 -#define CUDA_HALF_TENSOR 1 -#endif +# if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 +# define CUDA_HALF_TENSOR 1 +# endif /* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode") even if native harware support is available. @@ -16,30 +16,54 @@ (especially if the bulk of your code is in CUDNN and not Cutorch). */ -#define FORCE_PSEUDO_FP16 1 +# define FORCE_PSEUDO_FP16 1 -#ifndef FORCE_PSEUDO_FP16 +# ifndef FORCE_PSEUDO_FP16 /* Kernel side: Native fp16 ALU instructions are available if we have this: */ -# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530) -# define CUDA_HALF_INSTRUCTIONS 1 +# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530) +# define CUDA_HALF_INSTRUCTIONS 1 +# endif # endif -#endif -#ifdef CUDA_HALF_TENSOR -#include -#include +# ifdef CUDA_HALF_TENSOR + +# include +# include -/* CPU emulation */ -THC_EXTERNC half THC_float2half(float a); +/* CPU conversion methods, scalar */ +THC_EXTERNC half THC_float2half(float a); THC_EXTERNC float THC_half2float(half a); -#if defined (__CUDA_ARCH__) -# define THC_FLOAT_TO_HALF(x) __float2half((float)x) -# define THC_HALF_TO_FLOAT(x) __half2float((float)x) -#else -# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) -# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x) -#endif +// +// Vector conversion routines, using Thrust +// +THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len); +THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); + +# if defined (__CUDA_ARCH__) +/* use instrintic functons defined for device only in cuda_fp16.h */ +# define THC_FLOAT_TO_HALF(x) __float2half((float)x) +# define THC_HALF_TO_FLOAT(x) __half2float((float)x) +# else +/* use host conversion functions */ +# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) +# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x) +# endif + +/* Basic wrapper for 'half */ +struct Half: public half { + const half& val() const { return *this; } + half& val() { return *this; } + const half& operator half() const { return val(); } + Half(const half& v): half(v) {} + Half(const Half& v): half(v.val) {} +}; + + +/* "pseudo-fp16" type: 16-bit storage, float math */ +struct PseudoHalf: public Half { + PseudoHalf(const Half& v): Half(v.val) {} +}; /* Check for native fp16 support on the current device (CC 5.3+) */ THC_EXTERNC int THC_nativeHalfInstructions(THCState *state); @@ -52,15 +76,6 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b return a.x != b.x; } -#endif /* CUDA_HALF_TENSOR */ - -#ifdef __CUDA_ARCH__ -// -// host (CPU) routines -// -THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len); -THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); - /// `half` has some type conversion issues associated with it, since it /// is a struct without a constructor/implicit conversion constructor. /// We use this to convert scalar values to the given type that the @@ -68,12 +83,12 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); template struct ScalarConvert { - static inline __host__ __device__ Out to(const In& v) { return Out(v); } + static inline __host__ __device__ const Out& to(const In& v) { return Out(v); } }; template struct ScalarConvert { - static __host__ __device__ __forceinline__ Out to(const half& v) { + static __host__ __device__ __forceinline__ const Out& to(const half& v) { return (Out) THC_HALF_TO_FLOAT(v); } }; @@ -87,10 +102,10 @@ struct ScalarConvert { template <> struct ScalarConvert { - static __host__ __device__ __forceinline__ half to(const half& v) { + static __host__ __device__ __forceinline__ const half& to(const half& v) { return v; } }; -#endif /* CUDA */ -#endif +# endif /* CUDA_HALF_TENSOR */ +#endif /* THC_HALF_CONVERSION_INC */ diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 09443605..5817d59d 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -2,630 +2,208 @@ #define THC_NUMERICS_INC #include -#include +#include #include "THCHalf.h" -/// Class for numeric limits of the particular data type, which -/// includes support for `half`. -/// Unfortunately since `half` does not have a constructor, these have -/// to be expressed as functions (either that or non-const statics). +using std::numeric_limits; + +template +struct THCNumericsBase { +}; + template -struct THCNumerics { +struct THC_math_traits +{ + typedef T storage_type; + /* type value should be converted to before doing math on it. + For most types, MathType==StorageType. + */ + typedef T math_type; + /* type of expression , like (a*b). Usually == MathType */ + typedef T expr_type; }; +/* default handling for bare half is pseudo */ template <> -struct THCNumerics { - static inline __host__ __device__ unsigned char min() { return 0; } - static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; } - - static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; } - static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; } - static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; } - static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; } - static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; } - static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; } - - static inline __host__ __device__ unsigned char add(unsigned char a, unsigned char b) { return a + b; } - static inline __host__ __device__ unsigned char mul(unsigned char a, unsigned char b) { return a * b; } - static inline __host__ __device__ unsigned char sub(unsigned char a, unsigned char b) { return a - b; } - static inline __host__ __device__ unsigned char div(unsigned char a, unsigned char b) { return a / b; } - static inline __host__ __device__ unsigned char abs(unsigned char a) { return abs(a); } -}; +struct THC_math_traits +{ + typedef half storage_type; + typedef float math_type; + typedef float expr_type; +} template <> -struct THCNumerics { - static inline __host__ __device__ char min() { return CHAR_MIN; } - static inline __host__ __device__ char max() { return CHAR_MAX; } - - static inline __host__ __device__ bool lt(char a, char b) { return a < b; } - static inline __host__ __device__ bool le(char a, char b) { return a <= b; } - static inline __host__ __device__ bool gt(char a, char b) { return a > b; } - static inline __host__ __device__ bool ge(char a, char b) { return a >= b; } - static inline __host__ __device__ bool eq(char a, char b) { return a == b; } - static inline __host__ __device__ bool ne(char a, char b) { return a != b; } - - static inline __host__ __device__ char add(char a, char b) { return a + b; } - static inline __host__ __device__ char mul(char a, char b) { return a * b; } - static inline __host__ __device__ char sub(char a, char b) { return a - b; } - static inline __host__ __device__ char div(char a, char b) { return a / b; } - static inline __host__ __device__ char abs(char a) { return abs(a); } -}; +struct THC_math_traits +{ + typedef half storage_type; + typedef float math_type; + typedef float expr_type; +} template <> -struct THCNumerics { - static inline __host__ __device__ short min() { return SHRT_MIN; } - static inline __host__ __device__ short max() { return SHRT_MAX; } - - static inline __host__ __device__ bool lt(short a, short b) { return a < b; } - static inline __host__ __device__ bool le(short a, short b) { return a <= b; } - static inline __host__ __device__ bool gt(short a, short b) { return a > b; } - static inline __host__ __device__ bool ge(short a, short b) { return a >= b; } - static inline __host__ __device__ bool eq(short a, short b) { return a == b; } - static inline __host__ __device__ bool ne(short a, short b) { return a != b; } - - static inline __host__ __device__ short add(short a, short b) { return a + b; } - static inline __host__ __device__ short mul(short a, short b) { return a * b; } - static inline __host__ __device__ short sub(short a, short b) { return a - b; } - static inline __host__ __device__ short div(short a, short b) { return a / b; } - static inline __host__ __device__ short abs(short a) { return abs(a); } +struct THC_math_traits +{ + typedef half storage_type; + typedef half math_type; + typedef half expr_type; +} + +/// Class for numeric limits of the particular data type, which +/// includes support for `half`. +template +struct THCNumericsBase { + + typedef THC_math_traits traits; + typedef typename traits::storage_type storage_type; + typedef typename traits::math_type math_type; + typedef typename traits::expr_type expr_type; + + static const math_type one = 1; + static const math_type zero = 0; + + static __host__ __device__ __forceinline__ const math_type& m_(const storage_type& a) { + return ScalarConvert::to(a); + } + static __host__ __device__ __forceinline__ const expr_type& e_(const math_type& a) { + return ScalarConvert::to(a); + } + + static inline __host__ __device__ expr_type min() { return e_(numeric_limits::min()); } + static inline __host__ __device__ expt_type max() { return e_(numeric_limits::max()); } + + static inline __host__ __device__ bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } + static inline __host__ __device__ bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } + static inline __host__ __device__ bool gt(const storage_type& a, const storage_type& b) { return m_(a) > m_(b); } + static inline __host__ __device__ bool ge(const storage_type& a, const storage_type& b) { return m_(a) >= m_(b); } + static inline __host__ __device__ bool eq(const storage_type& a, const storage_type& b) { return m_(a) == m_(b); } + static inline __host__ __device__ bool ne(const storage_type& a, const storage_type& b) { return m_(a) != m_(b); } + + static inline __host__ __device__ const expr_type& add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } + static inline __host__ __device__ const expr_type& mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } + static inline __host__ __device__ const expr_type& sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } + static inline __host__ __device__ const expr_type& div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } + static inline __host__ __device__ const expr_type& abs(const storage_type& a) { return e_(abs(m_(a))); } }; -template <> -struct THCNumerics { - static inline __host__ __device__ int min() { return INT_MIN; } - static inline __host__ __device__ int max() { return INT_MAX; } - - static inline __host__ __device__ bool lt(int a, int b) { return a < b; } - static inline __host__ __device__ bool le(int a, int b) { return a <= b; } - static inline __host__ __device__ bool gt(int a, int b) { return a > b; } - static inline __host__ __device__ bool ge(int a, int b) { return a >= b; } - static inline __host__ __device__ bool eq(int a, int b) { return a == b; } - static inline __host__ __device__ bool ne(int a, int b) { return a != b; } - - static inline __host__ __device__ int add(int a, int b) { return a + b; } - static inline __host__ __device__ int mul(int a, int b) { return a * b; } - static inline __host__ __device__ int sub(int a, int b) { return a - b; } - static inline __host__ __device__ int div(int a, int b) { return a / b; } - static inline __host__ __device__ int abs(int a) { return ::abs(a); } +template +struct THCNumericsBase: public THCNumericsBase { + + static const math_type one = 1.0; + static const math_type zero = 0.; + + static inline __host__ __device__ expr_type exp (const storage_type& a) { return e_(::exp(m_(a)); } + static inline __host__ __device__ expr_type log (const storage_type& a) { return e_(::log(m_(a))); } + static inline __host__ __device__ expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } + static inline __host__ __device__ expr_type cos (const storage_type& a) { return e_(::cos(m_(a))); } + static inline __host__ __device__ expr_type sin (const storage_type& a) { return e_(::sin(m_(a))); } + static inline __host__ __device__ expr_type sqrt (const storage_type& a) { return e_(::sqrt(m_(a))); } + static inline __host__ __device__ expr_type rsqrt(const storage_type& a) { return e_(::rsqrt(m_(a))); } + static inline __host__ __device__ expr_type ceil (const storage_type& a) { return e_(::ceil(m_(a))); } + static inline __host__ __device__ expr_type floor(const storage_type& a) { return e_(::floor(m_(a))); } + static inline __host__ __device__ expr_type trunc(const storage_type& a) { return e_(::trunc(m_(a))); } + static inline __host__ __device__ expr_type neg (const storage_type& a) { return e_(-m_(a)); } + static inline __host__ __device__ expr_type acos (const storage_type& a) { return e_(::acos(m_(a))); } + static inline __host__ __device__ expr_type cosh (const storage_type& a) { return e_(::cosh(m_(a))); } + static inline __host__ __device__ expr_type acosh(const storage_type& a) { return e_(::acosh(m_(a))); } + static inline __host__ __device__ expr_type asin (const storage_type& a) { return e_(::asin(m_(a))); } + static inline __host__ __device__ expr_type sinh (const storage_type& a) { return e_(::sinh(m_(a))); } + static inline __host__ __device__ expr_type asinh(const storage_type& a) { return e_(::asinh(m_(a))); } + static inline __host__ __device__ expr_type tan (const storage_type& a) { return e_(::tan(m_(a))); } + static inline __host__ __device__ expr_type atan (const storage_type& a) { return e_(::atan(m_(a))); } + static inline __host__ __device__ expr_type tanh (const storage_type& a) { return e_(::tanh(m_(a))); } + static inline __host__ __device__ expr_type abs (const storage_type& a) { return e_(::abs(m_(a))); } + static inline __host__ __device__ expr_type round(const storage_type& a) { return e_(::round(m_(a))); } + static inline __host__ __device__ expr_type frac (const storage_type& a) { return e_(m_(a) - ::trunc(m_(a))); } + static inline __host__ __device__ expr_type cinv (const storage_type& a) { return e_(one / m_(m_(a)))); } + static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow(a, b)); } }; +template +struct THCNumerics: public THCNumericsBase::is_integer> { +}; + +/* do we need this ? */ template <> -struct THCNumerics { - static inline __host__ __device__ long min() { return LONG_MIN; } - static inline __host__ __device__ long max() { return LONG_MAX; } - - static inline __host__ __device__ bool lt(long a, long b) { return a < b; } - static inline __host__ __device__ bool le(long a, long b) { return a <= b; } - static inline __host__ __device__ bool gt(long a, long b) { return a > b; } - static inline __host__ __device__ bool ge(long a, long b) { return a >= b; } - static inline __host__ __device__ bool eq(long a, long b) { return a == b; } - static inline __host__ __device__ bool ne(long a, long b) { return a != b; } - - static inline __host__ __device__ long add(long a, long b) { return a + b; } - static inline __host__ __device__ long mul(long a, long b) { return a * b; } - static inline __host__ __device__ long sub(long a, long b) { return a - b; } - static inline __host__ __device__ long div(long a, long b) { return a / b; }; +struct THCNumerics : public THCNumericsBase { static inline __host__ __device__ long abs(long a) { return labs(a); } }; + #ifdef CUDA_HALF_TENSOR + template <> -struct THCNumerics { - static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; } - static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; } +struct THCNumerics: public THCNumericsBase { +}; - static inline __host__ __device__ bool lt(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS +template <> +struct THCNumerics: public THCNumericsBase { + static inline __host__ __device__ bool lt(const half& a, const half& b) { return __hlt(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa < fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) < THC_half2float(b); -#endif } - - static inline __host__ __device__ bool le(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ bool le(const half& a, const half& b) { return __hle(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa <= fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) <= THC_half2float(b); -#endif } - static inline __host__ __device__ bool gt(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ bool gt(const half& a, const half& b) { return __hgt(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa > fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) > THC_half2float(b); -#endif } - static inline __host__ __device__ bool ge(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ bool ge(const half& a, const half& b) { return __hge(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa >= fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) >= THC_half2float(b); -#endif } - static inline __host__ __device__ bool eq(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ bool eq(const half& a, const half& b) { return __heq(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa == fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) == THC_half2float(b); -#endif } - static inline __host__ __device__ bool ne(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ bool ne(const half& a, const half& b) { return __hne(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return fa != fb; -#endif -#else // __CUDA_ARCH__ - return THC_half2float(a) != THC_half2float(b); -#endif } - - static inline __host__ __device__ half exp(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half exp(const half& a) { return hexp(a); -#else - float fa = __half2float(a); - return __float2half(expf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(expf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half log(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half log(const half& a) { return hlog(a); -#else - float fa = __half2float(a); - return __float2half(logf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(logf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half log1p(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(log1pf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(log1pf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half cos(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half cos(const half& a) { return hcos(a); -#else - float fa = __half2float(a); - return __float2half(cosf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(cosf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half sin(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half sin(const half& a) { return hsin(a); -#else - float fa = __half2float(a); - return __float2half(sinf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(sinf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half sqrt(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half sqrt(const half& a) { return hsqrt(a); -#else - float fa = __half2float(a); - return __float2half(sqrtf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(sqrtf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half rsqrt(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half rsqrt(const half& a) { return hrsqrt(a); -#else - float fa = __half2float(a); - return __float2half(rsqrtf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(rsqrtf(THC_half2float(a))); -#endif } - - static inline __host__ __device__ half ceil(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half ceil(const half& a) { return hceil(a); -#else - float fa = __half2float(a); - return __float2half(ceilf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(ceilf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half floor(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half floor(const half& a) { return hfloor(a); -#else - float fa = __half2float(a); - return __float2half(floorf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(floorf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half trunc(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half trunc(const half& a) { return htrunc(a); -#else - float fa = __half2float(a); - return __float2half(truncf(fa)); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(truncf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half neg(half a) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half neg(const half& a) { return __hneg(a); -#else - float fa = __half2float(a); - return __float2half(-fa); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(-(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half acos(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(acosf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(acosf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half cosh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(coshf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(coshf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half asin(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(asinf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(asinf(THC_half2float(a))); -#endif } - static inline __host__ __device__ half sinh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(sinhf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(sinhf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half tan(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(tanf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(tanf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half atan(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(atanf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(atanf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half tanh(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(tanhf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(tanhf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half abs(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(fabs(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(fabs(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half round(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(roundf(fa)); -#else // __CUDA_ARCH__ - return THC_float2half(roundf(THC_half2float(a))); -#endif - } - - static inline __host__ __device__ half frac(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(fa - truncf(fa)); -#else // __CUDA_ARCH__ - float fa = THC_half2float(a); - return THC_float2half(fa - floorf(fa)); -#endif - } - - static inline __host__ __device__ half cinv(half a) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - return __float2half(1.0f / fa); -#else // __CUDA_ARCH__ - return THC_float2half(1.0f / THC_half2float(a)); -#endif - } - - static inline __host__ __device__ half add(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ const half& add(const half& a, const half& b) { return __hadd(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa + fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) + THC_half2float(b)); -#endif } - - static inline __host__ __device__ half div(half a, half b) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa / fb ); -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) / THC_half2float(b)); -#endif - } - - static inline __host__ __device__ half mul(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half mul(const half& a, const half& b) { return __hmul(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa * fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) * THC_half2float(b)); -#endif } - static inline __host__ __device__ half sub(half a, half b) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS + static inline __host__ __device__ half sub(const half& a, const half& b) { return __hsub(a, b); -#else - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half( fa - fb ); -#endif -#else // __CUDA_ARCH__ - return THC_float2half(THC_half2float(a) - THC_half2float(b)); -#endif - } - - static inline __host__ __device__ half pow(half a, half b) { -#ifdef __CUDA_ARCH__ - float fa = __half2float(a); - float fb = __half2float(b); - return __float2half(powf(fa, fb)); -#else // __CUDA_ARCH__ - return THC_float2half(powf(THC_half2float(a), THC_half2float(b))); -#endif - } - -}; -#endif - -template <> -struct THCNumerics { - static inline __host__ __device__ float min() { return -FLT_MAX; } - static inline __host__ __device__ float max() { return FLT_MAX; } - - static inline __host__ __device__ bool lt(float a, float b) { return a < b; } - static inline __host__ __device__ bool le(float a, float b) { return a <= b; } - static inline __host__ __device__ bool gt(float a, float b) { return a > b; } - static inline __host__ __device__ bool ge(float a, float b) { return a >= b; } - static inline __host__ __device__ bool eq(float a, float b) { return a == b; } - static inline __host__ __device__ bool ne(float a, float b) { return a != b; } - - static inline __host__ __device__ float exp (float a) { return expf(a); } - static inline __host__ __device__ float log (float a) { return logf(a); } - static inline __host__ __device__ float log1p(float a) { return log1pf(a); } - static inline __host__ __device__ float cos (float a) { return cosf(a); } - static inline __host__ __device__ float sin (float a) { return sinf(a); } - static inline __host__ __device__ float sqrt (float a) { return sqrtf(a); } - static inline __host__ __device__ float rsqrt(float a) { return rsqrtf(a); } - static inline __host__ __device__ float ceil (float a) { return ceilf(a); } - static inline __host__ __device__ float floor(float a) { return floorf(a); } - static inline __host__ __device__ float trunc(float a) { return truncf(a); } - static inline __host__ __device__ float neg (float a) { return -a; } - static inline __host__ __device__ float acos (float a) { return acosf(a); } - static inline __host__ __device__ float cosh (float a) { return coshf(a); } - static inline __host__ __device__ float acosh(float a) { return acoshf(a); } - static inline __host__ __device__ float asin (float a) { return asinf(a); } - static inline __host__ __device__ float sinh (float a) { return sinhf(a); } - static inline __host__ __device__ float asinh(float a) { return asinhf(a); } - static inline __host__ __device__ float tan (float a) { return tanf(a); } - static inline __host__ __device__ float atan (float a) { return atanf(a); } - static inline __host__ __device__ float tanh (float a) { return tanhf(a); } - static inline __host__ __device__ float abs (float a) { return fabs(a); } - static inline __host__ __device__ float round(float a) { return roundf(a); } - static inline __host__ __device__ float frac (float a) { return a - truncf(a); } - static inline __host__ __device__ float cinv (float a) { return 1.0f / a; } - static inline __host__ __device__ float add (float a, float b) { return a + b; } - static inline __host__ __device__ float div (float a, float b) { return a / b; } - static inline __host__ __device__ float mul (float a, float b) { return a * b; } - static inline __host__ __device__ float sub (float a, float b) { return a - b; } - static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } -}; - -template <> -struct THCNumerics { - static inline __host__ __device__ double min() { return -DBL_MAX; } - static inline __host__ __device__ double max() { return DBL_MAX; } - - static inline __host__ __device__ bool lt(double a, double b) { return a < b; } - static inline __host__ __device__ bool le(double a, double b) { return a <= b; } - static inline __host__ __device__ bool gt(double a, double b) { return a > b; } - static inline __host__ __device__ bool ge(double a, double b) { return a >= b; } - static inline __host__ __device__ bool eq(double a, double b) { return a == b; } - static inline __host__ __device__ bool ne(double a, double b) { return a != b; } - - static inline __host__ __device__ double exp (double a) { return ::exp(a); } - static inline __host__ __device__ double log (double a) { return ::log(a); } - static inline __host__ __device__ double log1p(double a) { return ::log1p(a); } - static inline __host__ __device__ double cos (double a) { return ::cos(a); } - static inline __host__ __device__ double sin (double a) { return ::sin(a); } - static inline __host__ __device__ double sqrt (double a) { return ::sqrt(a); } - static inline __host__ __device__ double rsqrt(double a) { return ::rsqrt(a); } - static inline __host__ __device__ double ceil (double a) { return ::ceil(a); } - static inline __host__ __device__ double floor(double a) { return ::floor(a); } - static inline __host__ __device__ double trunc(double a) { return ::trunc(a); } - static inline __host__ __device__ double neg (double a) { return -a; } - static inline __host__ __device__ double acos (double a) { return ::acos(a); } - static inline __host__ __device__ double cosh (double a) { return ::cosh(a); } - static inline __host__ __device__ double acosh(double a) { return ::acosh(a); } - static inline __host__ __device__ double asin (double a) { return ::asin(a); } - static inline __host__ __device__ double sinh (double a) { return ::sinh(a); } - static inline __host__ __device__ double asinh(double a) { return ::asinh(a); } - static inline __host__ __device__ double tan (double a) { return ::tan(a); } - static inline __host__ __device__ double atan (double a) { return ::atan(a); } - static inline __host__ __device__ double tanh (double a) { return ::tanh(a); } - static inline __host__ __device__ double abs (double a) { return ::abs(a); } - static inline __host__ __device__ double round(double a) { return ::round(a); } - static inline __host__ __device__ double frac (double a) { return a - ::trunc(a); } - static inline __host__ __device__ double cinv (double a) { return 1.0 / a; } - static inline __host__ __device__ double add (double a, double b) { return a + b; } - static inline __host__ __device__ double div (double a, double b) { return a / b; } - static inline __host__ __device__ double mul (double a, double b) { return a * b; } - static inline __host__ __device__ double sub (double a, double b) { return a - b; } - static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } -}; - -/// `half` has some type conversion issues associated with it, since it -/// is a struct without a constructor/implicit conversion constructor. -/// We use this to convert scalar values to the given type that the -/// tensor expects. -template -struct ScalarConvert { - static __host__ __device__ Out to(const In v) { return (Out) v; } -}; - -#ifdef CUDA_HALF_TENSOR -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; } }; #endif -#endif // THC_NUMERICS_INC +#endif // THC_NUMERICS_INC diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index c52e0827..f7c6d2af 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -12,51 +12,24 @@ template struct TensorSigmoidOp { __device__ __forceinline__ void operator()(T* out, T* in) const { - T one = (T) 1.0; - *out = one / (one + THCNumerics::exp(- *in)); + *out = THCNumerics::one / (THCNumerics::one + THCNumerics::exp(- *in)); } __device__ __forceinline__ void operator()(T* v) const { - T one = (T) 1.0; - *v = one / (one + THCNumerics::exp(- *v)); + *v = THCNumerics::one / (THCNumerics::one + THCNumerics::exp(- *v)); } }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { -#ifdef CUDA_HALF_INSTRUCTIONS - half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); -#else - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); -#endif - } - - __device__ __forceinline__ void operator()(half* v) const { -#ifdef CUDA_HALF_INSTRUCTIONS - half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); -#else - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); -#endif - } -}; -#endif - template struct TensorSignOp { __device__ __forceinline__ void operator()(T* out, T* in) { - T orig = *in; - *out = (orig > 0) - (orig < 0); + const T& orig = *in; + *out = (THCNumerics::gt(orig, THCNumerics::zero) ? THCNumerics::one : + (orig < 0); } __device__ __forceinline__ void operator()(T* v) { - T orig = *v; - *v = (orig > 0) - (orig < 0); + operator()(v, v); } }; From acaf1a64d4fed30de3fa322926cd1babcfa1cd3b Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 16 Oct 2016 22:27:04 -0700 Subject: [PATCH 04/13] Checkpoint --- lib/THC/THCAtomics.cuh | 4 +- lib/THC/THCHalf.cu | 6 + lib/THC/THCHalf.h | 40 +- lib/THC/THCNumerics.cuh | 129 ++++--- lib/THC/THCTensorMathPairwise.cu | 215 +---------- lib/THC/THCTensorMathPointwise.cuh | 429 +++++----------------- lib/THC/THCTensorTypeUtils.cuh | 51 --- lib/THC/generic/THCTensorMathPointwise.cu | 4 +- 8 files changed, 200 insertions(+), 678 deletions(-) diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh index 42291144..606a776a 100644 --- a/lib/THC/THCAtomics.cuh +++ b/lib/THC/THCAtomics.cuh @@ -98,12 +98,12 @@ static inline __device__ void atomicAdd(half *address, half val) { (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; unsigned int assumed; - + typedef THCNumerics N_; do { assumed = old; half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); + hsum = N_::s_(N_::add(hsum, val)); old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 4f317102..cd957605 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -1,4 +1,5 @@ #include "THCHalf.h" +#include "THCNumerics.cuh" #include #include @@ -29,3 +30,8 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { #endif in, in + len, out, __half2floatOp()); } + +#if defined (__CUDA_ARCH__) && defined (CUDA_FP16_INSTRINTICS) +template <> const half THCMathTraitsBase::one() { return THC_FLOAT_TO_HALF(1.); } +template <> const half THCMathTraitsBase::zero(){ return THC_FLOAT_TO_HALF(0.); } +#endif diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 11c3bb94..f4c0eb71 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -8,23 +8,6 @@ # define CUDA_HALF_TENSOR 1 # endif -/* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode") - even if native harware support is available. - This makes difference for Pascal (6.x) cards only: Maxwell (5.x) cards always run 'half' in pseudo mode. - !!! Uncomment on your own risk !!! - Native fp16 operations may in fact run slower than pseudo-fp16 on your system at the moment - (especially if the bulk of your code is in CUDNN and not Cutorch). -*/ - -# define FORCE_PSEUDO_FP16 1 - -# ifndef FORCE_PSEUDO_FP16 -/* Kernel side: Native fp16 ALU instructions are available if we have this: */ -# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530) -# define CUDA_HALF_INSTRUCTIONS 1 -# endif -# endif - # ifdef CUDA_HALF_TENSOR # include @@ -43,27 +26,28 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); # if defined (__CUDA_ARCH__) /* use instrintic functons defined for device only in cuda_fp16.h */ # define THC_FLOAT_TO_HALF(x) __float2half((float)x) -# define THC_HALF_TO_FLOAT(x) __half2float((float)x) +# define THC_HALF_TO_FLOAT(x) __half2float(x) # else /* use host conversion functions */ # define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) -# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x) +# define THC_HALF_TO_FLOAT(x) THC_half2float(x) # endif -/* Basic wrapper for 'half */ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +# define CUDA_FP16_INSTRINTICS 1 +#endif + +/* Basic wrapper for 'native' half */ struct Half: public half { + public: const half& val() const { return *this; } half& val() { return *this; } - const half& operator half() const { return val(); } Half(const half& v): half(v) {} - Half(const Half& v): half(v.val) {} + Half(const Half& v): half(v.val()) {} }; - /* "pseudo-fp16" type: 16-bit storage, float math */ -struct PseudoHalf: public Half { - PseudoHalf(const Half& v): Half(v.val) {} -}; +typedef half PseudoHalf; /* Check for native fp16 support on the current device (CC 5.3+) */ THC_EXTERNC int THC_nativeHalfInstructions(THCState *state); @@ -83,12 +67,12 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b template struct ScalarConvert { - static inline __host__ __device__ const Out& to(const In& v) { return Out(v); } + static inline __host__ __device__ Out to(const In& v) { return Out(v); } }; template struct ScalarConvert { - static __host__ __device__ __forceinline__ const Out& to(const half& v) { + static __host__ __device__ __forceinline__ Out to(const half& v) { return (Out) THC_HALF_TO_FLOAT(v); } }; diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 5817d59d..1c1b3cb2 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -5,71 +5,75 @@ #include #include "THCHalf.h" -using std::numeric_limits; + +template +struct CudaNumericLimits +{ +}; template struct THCNumericsBase { }; template -struct THC_math_traits +struct THCMathTraitsBase { typedef T storage_type; /* type value should be converted to before doing math on it. - For most types, MathType==StorageType. + For most types except 16-bit floats, MathType==StorageType. */ typedef T math_type; /* type of expression , like (a*b). Usually == MathType */ typedef T expr_type; + + static const storage_type one(); + static const storage_type zero(); }; -/* default handling for bare half is pseudo */ -template <> -struct THC_math_traits +template +const T THCMathTraitsBase::zero() { return T(0); } + +template +const T THCMathTraitsBase::one() { return T(1); } + +template +struct THCMathTraits: public THCMathTraitsBase { - typedef half storage_type; - typedef float math_type; - typedef float expr_type; -} +}; +/* default handling for bare half is pseudo */ template <> -struct THC_math_traits +struct THCMathTraits: public THCMathTraitsBase { - typedef half storage_type; typedef float math_type; - typedef float expr_type; -} + typedef half expr_type; +}; template <> -struct THC_math_traits +struct THCMathTraits: public THCMathTraitsBase { - typedef half storage_type; typedef half math_type; typedef half expr_type; -} +}; -/// Class for numeric limits of the particular data type, which -/// includes support for `half`. template -struct THCNumericsBase { - - typedef THC_math_traits traits; +struct THCNumericsCommonBase { + typedef THCMathTraits traits; typedef typename traits::storage_type storage_type; typedef typename traits::math_type math_type; typedef typename traits::expr_type expr_type; - static const math_type one = 1; - static const math_type zero = 0; - - static __host__ __device__ __forceinline__ const math_type& m_(const storage_type& a) { + static __host__ __device__ __forceinline__ math_type m_(const storage_type& a) { return ScalarConvert::to(a); } - static __host__ __device__ __forceinline__ const expr_type& e_(const math_type& a) { + static __host__ __device__ __forceinline__ expr_type e_(const math_type& a) { return ScalarConvert::to(a); } - - static inline __host__ __device__ expr_type min() { return e_(numeric_limits::min()); } - static inline __host__ __device__ expt_type max() { return e_(numeric_limits::max()); } + static __host__ __device__ __forceinline__ storage_type s_(const expr_type& a) { + return ScalarConvert::to(a); + } + static __host__ __device__ const T min(); + static __host__ __device__ const T max(); static inline __host__ __device__ bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } static inline __host__ __device__ bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } @@ -78,20 +82,48 @@ struct THCNumericsBase { static inline __host__ __device__ bool eq(const storage_type& a, const storage_type& b) { return m_(a) == m_(b); } static inline __host__ __device__ bool ne(const storage_type& a, const storage_type& b) { return m_(a) != m_(b); } - static inline __host__ __device__ const expr_type& add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } - static inline __host__ __device__ const expr_type& mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } - static inline __host__ __device__ const expr_type& sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } - static inline __host__ __device__ const expr_type& div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } - static inline __host__ __device__ const expr_type& abs(const storage_type& a) { return e_(abs(m_(a))); } + static inline __host__ __device__ expr_type add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } + static inline __host__ __device__ expr_type mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } + static inline __host__ __device__ expr_type sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } + static inline __host__ __device__ expr_type div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } + static inline __host__ __device__ expr_type abs(const storage_type& a) { return e_(abs(m_(a))); } + static inline __host__ __device__ expr_type neg(const storage_type& a) { return e_(-m_(a)); } + static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); } + }; template -struct THCNumericsBase: public THCNumericsBase { +__host__ __device__ const T THCNumericsCommonBase::min() { return std::numeric_limits::min(); } + +template +__host__ __device__ const T THCNumericsCommonBase::max() { return std::numeric_limits::max(); } + +/* specialized versions */ +template <> +const half THCNumericsCommonBase::min(); + +template <> +const half THCNumericsCommonBase::max(); - static const math_type one = 1.0; - static const math_type zero = 0.; +/// Class for numeric limits of the particular data type, which +/// includes support for `half`. +template +struct THCNumericsBase : public THCNumericsCommonBase { +}; - static inline __host__ __device__ expr_type exp (const storage_type& a) { return e_(::exp(m_(a)); } +template +struct THCNumericsBase : public THCNumericsCommonBase { + typedef THCNumericsCommonBase Base; + using typename Base::traits; + using typename Base::math_type; + using typename Base::expr_type; + using typename Base::storage_type; + using Base::e_; + using Base::m_; + using Base::s_; + + + static inline __host__ __device__ expr_type exp (const storage_type& a) { return e_(::exp(m_(a))); } static inline __host__ __device__ expr_type log (const storage_type& a) { return e_(::log(m_(a))); } static inline __host__ __device__ expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } static inline __host__ __device__ expr_type cos (const storage_type& a) { return e_(::cos(m_(a))); } @@ -101,7 +133,6 @@ struct THCNumericsBase: public THCNumericsBase { static inline __host__ __device__ expr_type ceil (const storage_type& a) { return e_(::ceil(m_(a))); } static inline __host__ __device__ expr_type floor(const storage_type& a) { return e_(::floor(m_(a))); } static inline __host__ __device__ expr_type trunc(const storage_type& a) { return e_(::trunc(m_(a))); } - static inline __host__ __device__ expr_type neg (const storage_type& a) { return e_(-m_(a)); } static inline __host__ __device__ expr_type acos (const storage_type& a) { return e_(::acos(m_(a))); } static inline __host__ __device__ expr_type cosh (const storage_type& a) { return e_(::cosh(m_(a))); } static inline __host__ __device__ expr_type acosh(const storage_type& a) { return e_(::acosh(m_(a))); } @@ -114,29 +145,24 @@ struct THCNumericsBase: public THCNumericsBase { static inline __host__ __device__ expr_type abs (const storage_type& a) { return e_(::abs(m_(a))); } static inline __host__ __device__ expr_type round(const storage_type& a) { return e_(::round(m_(a))); } static inline __host__ __device__ expr_type frac (const storage_type& a) { return e_(m_(a) - ::trunc(m_(a))); } - static inline __host__ __device__ expr_type cinv (const storage_type& a) { return e_(one / m_(m_(a)))); } - static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow(a, b)); } + static inline __host__ __device__ expr_type cinv (const storage_type& a) { return Base::div(Base::traits::one(), a); } + static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); } }; template -struct THCNumerics: public THCNumericsBase::is_integer> { +struct THCNumerics: public THCNumericsBase::is_integer> { }; -/* do we need this ? */ -template <> -struct THCNumerics : public THCNumericsBase { - static inline __host__ __device__ long abs(long a) { return labs(a); } -}; - - #ifdef CUDA_HALF_TENSOR template <> -struct THCNumerics: public THCNumericsBase { +struct THCNumerics: public THCNumericsBase { }; +#if defined (__CUDA_ARCH__) && defined (CUDA_FP16_INSTRINTICS) template <> struct THCNumerics: public THCNumericsBase { + static inline __host__ __device__ bool lt(const half& a, const half& b) { return __hlt(a, b); } @@ -204,6 +230,7 @@ struct THCNumerics: public THCNumericsBase { return __hsub(a, b); } }; +# endif #endif #endif // THC_NUMERICS_INC diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 2695f2df..bec31035 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -7,237 +7,56 @@ template struct TensorAddConstantOp { + typedef THCNumerics N_; TensorAddConstantOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in + val; + *out = N_::s_(N_::add(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v += val; - } - - const T val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorAddConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorAddConstantOp(half v) : val(v) {} -#else - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in, val); -#else - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hadd(*v, val); -#else - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); -#endif + this->operator()(v, v); } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif + const typename N_::storage_type val; }; -#endif // CUDA_HALF_TENSOR - template struct TensorSubConstantOp { + typedef THCNumerics N_; TensorSubConstantOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in - val; + *out = N_::s_(N_::sub(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v -= val; - } - - const T val; -}; - - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSubConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {} -#else - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in, val); -#else - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); -#endif + this->operator()(v, v); } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hadd(*v, val); -#else - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif + const typename N_::storage_type val; }; -#endif // CUDA_HALF_TENSOR template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in * val; + *out = N_::s_(N_::mul(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v *= val; - } - - const T val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorMulConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorMulConstantOp(half v) : val(v) {} -#else - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} -#endif - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in, val); -#else - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hmul(*v, val); -#else - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); -#endif + this->operator()(v, v); } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif + const typename N_::storage_type val; }; -#endif // CUDA_HALF_TENSOR template struct TensorDivConstantOp { - TensorDivConstantOp(T v) : val(v) {} + typedef THCNumerics N_; + TensorDivConstantOp(const T& v) : val(v) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *in / val; + *out = N_::s_(N_::div(*in, val)); } - __device__ __forceinline__ void operator()(T* v) { - *v /= val; - } - - const T val; -}; - -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(float v) : val(1.f / v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = *in * val; - } - - __device__ __forceinline__ void operator()(float* v) { - *v *= val; - } - - const float val; -}; - -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(double v) : val(1. / v) {} - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = *in * val; - } - - __device__ __forceinline__ void operator()(double* v) { - *v *= val; - } - - const double val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorDivConstantOp { -#ifdef CUDA_HALF_INSTRUCTIONS - TensorDivConstantOp(half v) : val(ScalarInv::to(v)) {} -#else - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} -#endif - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in, val); -#else - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); -#endif + this->operator()(v, v); } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - *v = __hmul(*v, val); -#else - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); -#endif - } - -#ifdef CUDA_HALF_INSTRUCTIONS - const half val; -#else - const float fval; -#endif + const typename N_::storage_type val; }; -#endif // CUDA_HALF_TENSOR template struct TensorTriOp { diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index f7c6d2af..dc696b18 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -11,417 +11,163 @@ template struct TensorSigmoidOp { + typedef THCNumerics N_; + typedef typename N_::traits traits; __device__ __forceinline__ void operator()(T* out, T* in) const { - *out = THCNumerics::one / (THCNumerics::one + THCNumerics::exp(- *in)); + *out = N_::div(traits::one(), N_::add(traits::one(), N_::neg(*in))); } - __device__ __forceinline__ void operator()(T* v) const { - *v = THCNumerics::one / (THCNumerics::one + THCNumerics::exp(- *v)); + this->operator()(v, v); } }; template struct TensorSignOp { + typedef THCNumerics N_; + typedef typename N_::traits traits; + __device__ __forceinline__ void operator()(T* out, T* in) { const T& orig = *in; - *out = (THCNumerics::gt(orig, THCNumerics::zero) ? THCNumerics::one : - (orig < 0); + *out = (N_::gt(orig, traits::zero()) ? traits::one() : + N_::lt(orig, traits::zero()) ? N_::neg(traits::one()) : + traits::zero()); } - __device__ __forceinline__ void operator()(T* v) { - operator()(v, v); - } -}; - -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(unsigned char* out, unsigned char* in) { - unsigned char orig = *in; - *out = (orig == 0) ? 0 : 1; - } - - __device__ __forceinline__ void operator()(unsigned char* v) { - unsigned char orig = *v; - *v = (orig == 0) ? 0 : 1; - } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - half zero = ScalarConvert::to(0); - half orig = *in; - *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); -#else - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); -#endif - } - - __device__ __forceinline__ void operator()(half* v) { -#ifdef CUDA_HALF_INSTRUCTIONS - half zero = ScalarConvert::to(0); - half orig = *v; - *v = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero)); -#else - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); -#endif + this->operator()(v, v); } }; -#endif template struct TensorAddOp { + typedef THCNumerics N_; + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = N_::s_(N_::add(*in1, *in2)); + } __device__ __forceinline__ void operator()(T* out, T* in) { - *out += *in; + this->operator()(out, in, out); } - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 + *in2; - } }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorAddOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout += fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 + fin2; - *out = __float2half(fout); -#endif - } -}; -#endif // CUDA_HALF_TENSOR template struct TensorCAddOp { + typedef THCNumerics N_; TensorCAddOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in) { - *out += val * *in; + this->operator()(out, out, in); } - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 + val * *in2; + *out = N_::s_(N_::add(*in1, N_::mul(val, *in2))); } - T val; + const typename N_::storage_type val; }; -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*out, __hmul(val, *in)); -#else - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hadd(*in1, __hmul(val, *in2)); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); -#endif - } - - half val; -}; -#endif // CUDA_HALF_TENSOR template struct TensorSubOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out -= *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 - *in2; + *out = N_::s_(N_::sub(*in1, *in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorSubOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout -= fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 - fin2; - *out = __float2half(fout); -#endif + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorMulOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out *= *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 * *in2; + *out = N_::s_(N_::mul(*in1, *in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*out, *in); -#else - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); -#endif - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { -#ifdef CUDA_HALF_INSTRUCTIONS - *out = __hmul(*in1, *in2); -#else - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); -#endif + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorPowOp { - TensorPowOp(T v) : val(v) {} + typedef THCNumerics N_; + TensorPowOp(T v) : val(N_::s_(v)) {} __device__ __forceinline__ void operator()(T* out, T* in) { - *out = powf((float) *in, (float) val); + *out = N_::s_(N_::pow(*in, val)); } __device__ __forceinline__ void operator()(T* v) { - *v = powf((float) *v, (float) val); + this->operator()(v, v); } - - const T val; + const typename N_::storage_type val; }; -template <> -struct TensorPowOp { - TensorPowOp(double v) : val(v) {} - - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = pow(*in, val); - } - - __device__ __forceinline__ void operator()(double* v) { - *v = pow(*v, val); - } - - const double val; -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorPowOp { - TensorPowOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fin = __half2float(*in); - float fval = __half2float(val); - float fout = powf(fin, fval); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - // No fp16 pow function yet - float fv = __half2float(*v); - float fval = __half2float(val); - float fout = powf(fv, fval); - *v = __float2half(fout); - } - - const half val; -}; -#endif // CUDA_HALF_TENSOR - template struct TensorCPowOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = powf((float) *out, (float) *in); - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = powf((float) *in1, (float) *in2); - } -}; - -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = pow(*out, *in); + *out = N_::s_(N_::pow(*in1,*in2)); } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = pow(*in1, *in2); - } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - // No fp16 pow function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorDivOp { - __device__ __forceinline__ void - operator()(T* out, T* in) { - *out /= *in; - } - - __device__ __forceinline__ void - operator()(T* out, T* in1, T* in2) { - *out = *in1 / *in2; + typedef THCNumerics N_; + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = N_::s_(N_::div(*in1,*in2)); } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorDivOp { - __device__ __forceinline__ void - operator()(half* out, half* in) { - // No fp16 div instruction yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout /= fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void - operator()(half* out, half* in1, half* in2) { - // No fp16 div instruction yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 / fin2; - *out = __float2half(fout); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorClampOp { + typedef THCNumerics N_; TensorClampOp(T min, T max) : minValue(min), maxValue(max) {} __device__ __forceinline__ void operator()(T* out, T* in) { - T val = THCNumerics::lt(*in, maxValue) ? *in : maxValue; - *out = THCNumerics::gt(minValue, val) ? minValue : val; + T val = N_::lt(*in, maxValue) ? *in : maxValue; + *out = N_::gt(minValue, val) ? minValue : val; } __device__ __forceinline__ void operator()(T* v) { - T val = THCNumerics::lt(*v, maxValue) ? *v : maxValue; - *v = THCNumerics::gt(minValue, val) ? minValue : val; + T val = N_::lt(*v, maxValue) ? *v : maxValue; + *v = N_::gt(minValue, val) ? minValue : val; } - - const T minValue; - const T maxValue; + const typename N_::storage_type minValue; + const typename N_::storage_type maxValue; }; template struct TensorLerpOp { + typedef THCNumerics N_; TensorLerpOp(T w) : w(w) {} - __device__ __forceinline__ void operator()(T *out, T *a, T *b) { - *out = THCNumerics::add( - *a, - THCNumerics::mul( - w, - THCNumerics::sub(*b, *a) - ) - ); + *out = N_::add(*a, N_::mul(w, N_::sub(*b, *a))); } - - const T w; + T w; }; template struct TensorCrossOp { + typedef THCNumerics N_; TensorCrossOp(long sx, long sy, long so) : sx(sx), sy(sy), so(so) {} - __device__ __forceinline__ void operator()(T* out, T* x, T*y) { - out[0 * so] = THCNumerics::sub( - THCNumerics::mul(x[1 * sx], y[2 * sy]), - THCNumerics::mul(x[2 * sx], y[1 * sy]) + out[0 * so] = N_::sub( + N_::mul(x[1 * sx], y[2 * sy]), + N_::mul(x[2 * sx], y[1 * sy]) ); - out[1 * so] = THCNumerics::sub( - THCNumerics::mul(x[2 * sx], y[0 * sy]), - THCNumerics::mul(x[0 * sx], y[2 * sy]) + out[1 * so] = N_::sub( + N_::mul(x[2 * sx], y[0 * sy]), + N_::mul(x[0 * sx], y[2 * sy]) ); - out[2 * so] = THCNumerics::sub( - THCNumerics::mul(x[0 * sx], y[1 * sy]), - THCNumerics::mul(x[1 * sx], y[0 * sy]) + out[2 * so] = N_::sub( + N_::mul(x[0 * sx], y[1 * sy]), + N_::mul(x[1 * sx], y[0 * sy]) ); } @@ -430,36 +176,39 @@ struct TensorCrossOp { template struct TensorMaxOp { + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::gt(*out, *in) ? *out : *in; + *out = N_::gt(*out, *in) ? *out : *in; } __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::gt(*in1, *in2) ? *in1 : *in2; + *out = N_::gt(*in1, *in2) ? *in1 : *in2; } }; template struct TensorMinOp { + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::lt(*out, *in) ? *out : *in; + *out = N_::lt(*out, *in) ? *out : *in; } __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::lt(*in1, *in2) ? *in1 : *in2; + *out = N_::lt(*in1, *in2) ? *in1 : *in2; } }; template struct TensorMaxValueOp { + typedef THCNumerics N_; TensorMaxValueOp(T v) : val(v) {} __device__ __forceinline__ void operator()(T* out) { - *out = THCNumerics::gt(*out, val) ? *out : val; + *out = N_::gt(*out, val) ? *out : val; } __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::gt(*in, val) ? *in : val; + *out = N_::gt(*in, val) ? *in : val; } T val; @@ -467,51 +216,39 @@ struct TensorMaxValueOp { template struct TensorMinValueOp { + typedef THCNumerics N_; TensorMinValueOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out) { - *out = THCNumerics::lt(*out, val) ? *out : val; + *out = N_::lt(*out, val) ? *out : val; } - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = THCNumerics::lt(*in, val) ? *in : val; + *out = N_::lt(*in, val) ? *in : val; } - T val; }; template struct TensorAddCMulOp { + typedef THCNumerics N_; TensorAddCMulOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::add( - *out, - THCNumerics::mul( - val, - THCNumerics::mul(*in1, *in2) - ) - ); + *out = N_::add(*out,N_::mul(val,N_::mul(*in1, *in2))); } - T val; }; template struct TensorAddCDivOp { + typedef THCNumerics N_; TensorAddCDivOp(T v) : val(v) {} - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = THCNumerics::add( - *out, - THCNumerics::mul( - val, - THCNumerics::div(*in1, *in2) - ) - ); + *out = N_::add( *out, + N_::mul(val, + N_::div(*in1, *in2) + ) + ); } - - T val; + typename N_::storage_type val; }; #endif // THC_TENSORMATH_POINTWISE_CUH diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 4f5d5164..729e22a7 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -96,55 +96,4 @@ getTensorInfo(THCState* state, TensorType* t) { TensorUtils::getData(state, t), dims, sz, st); } -template -struct ScalarNegate { - static __host__ __device__ T to(const T v) { return -v; } -}; - -template -struct ScalarInv { - static __host__ __device__ T to(const T v) { return ((T) 1) / v; } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct ScalarNegate { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ -#ifdef CUDA_HALF_INSTRUCTIONS - return __hneg(v); -#else - return __float2half(-__half2float(v)); -#endif -#else - half out = v; - out.x ^= 0x8000; // toggle sign bit - return out; -#endif - } -}; - -template <> -struct ScalarInv { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ - return __float2half(1.0f / __half2float(v)); -#else - float fv = THC_half2float(v); - fv = 1.0f / fv; - return THC_float2half(fv); -#endif - } -}; - -inline bool operator==(half a, half b) { - return a.x == b.x; -} - -inline bool operator!=(half a, half b) { - return a.x != b.x; -} - -#endif // CUDA_HALF_TENSOR - #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu index 26385042..287294db 100644 --- a/lib/THC/generic/THCTensorMathPointwise.cu +++ b/lib/THC/generic/THCTensorMathPointwise.cu @@ -245,7 +245,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, // self += -value * src2 if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp( - ScalarNegate::to(value)))) { + THCNumerics::neg(value)))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -261,7 +261,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, // self = src1 - value * src2 if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp( - ScalarNegate::to(value)))) { + THCNumerics::neg(value)))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } From 18383174c82fbfd08a9bd8b84639aeec2e4ee22a Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 28 Oct 2016 17:13:04 -0700 Subject: [PATCH 05/13] checkpoint --- CMakeLists.txt | 2 +- Tensor.lua | 9 +--- generic/CStorage.c | 24 +-------- generic/CTensor.c | 11 ---- lib/THC/CMakeLists.txt | 2 +- lib/THC/THCGenerateHalfType.h | 1 - lib/THC/THCHalf.cu | 87 +++---------------------------- lib/THC/THCHalf.h | 7 +-- lib/THC/THCStorage.c | 2 - lib/THC/THCStorage.h | 2 +- lib/THC/THCTensorTypeUtils.cuh | 2 +- lib/THC/generic/THCStorageCopy.c | 75 +++++++++++++------------- lib/THC/generic/THCStorageCopy.cu | 24 +++++---- lib/THC/generic/THCStorageCopy.h | 10 ++-- torch/generic/Tensor.c | 2 +- 15 files changed, 77 insertions(+), 183 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96224c44..c3698e68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ IF (NOT WIN32) SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}") ENDIF (NOT WIN32) IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) - SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") ENDIF() INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) diff --git a/Tensor.lua b/Tensor.lua index 76c87295..0636d682 100644 --- a/Tensor.lua +++ b/Tensor.lua @@ -24,6 +24,7 @@ end local TensorTypes = { float = 'torch.FloatTensor', + half = 'torch.HalfTensor', double = 'torch.DoubleTensor', byte = 'torch.ByteTensor', char = 'torch.CharTensor', @@ -85,11 +86,5 @@ for ValueType, CudaTensorType in pairs(CudaTensorTypes) do end if cutorch.hasHalf then - do - local function Tensor__totable(self) - local host_tensor = self:float() - return self:float():totable() - end - rawset(torch.getmetatable('torch.CudaHalfTensor'), 'totable', Tensor__totable) - end + CudaTensorTypes.half = 'torch.CudaHalfTensor' end diff --git a/generic/CStorage.c b/generic/CStorage.c index a6503f28..d6d4db57 100644 --- a/generic/CStorage.c +++ b/generic/CStorage.c @@ -6,7 +6,6 @@ /* everything is as the generic Storage.c, except few things (see below) */ -#ifndef THC_REAL_IS_HALF #define THFile_readRealRaw(file, data, size) \ { \ real *fdata = (real*)THAlloc(sizeof(real)*size); \ @@ -22,23 +21,6 @@ TH_CONCAT_3(THFile_write,Real,Raw)(file, fdata, size); \ THFree(fdata); \ } -#else -#define THFile_readRealRaw(file, data, size) \ - { \ - real *fdata = (real*)THAlloc(sizeof(real)*size); \ - THFile_readCharRaw(file, (char *)fdata, sizeof(real) * size); \ - THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \ - THFree(fdata); \ - } - -#define THFile_writeRealRaw(file, data, size) \ - { \ - real *fdata = (real*)THAlloc(sizeof(real)*size); \ - THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \ - THFile_writeCharRaw(file, (char *)fdata, size * sizeof(real)); \ - THFree(fdata); \ - } -#endif #define TH_GENERIC_FILE "generic/Storage.c" #include "generic/Storage.c" @@ -87,6 +69,8 @@ static int cutorch_Storage_(copy)(lua_State *L) THCStorage_(copyFloat)(state, storage, src); else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) THCStorage_(copyDouble)(state, storage, src); + else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) ) + THCStorage_(copyHalf)(state, storage, src); else luaL_typerror(L, 2, "torch.*Storage"); @@ -94,7 +78,6 @@ static int cutorch_Storage_(copy)(lua_State *L) return 1; } -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L) { THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage)); @@ -139,7 +122,6 @@ static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L) lua_settop(L, 1); return 1; } -#endif static int cutorch_Storage_(getDevice)(lua_State *L) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); @@ -154,12 +136,10 @@ void cutorch_Storage_(init)(lua_State* L) // torch_Storage macro is defined in Storage.c produce the CudaTensor types // so I have to construct the normal torch types by hand -#ifndef THC_REAL_IS_HALF luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage)); lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy)); lua_setfield(L, -2, "copy"); lua_pop(L, 1); -#endif luaT_pushmetatable(L, torch_Storage); lua_pushcfunction(L, cutorch_Storage_(copy)); diff --git a/generic/CTensor.c b/generic/CTensor.c index a9663ff1..64ac76a9 100644 --- a/generic/CTensor.c +++ b/generic/CTensor.c @@ -56,7 +56,6 @@ static int cutorch_Tensor_(copy)(lua_State *L) return 1; } -#ifndef THC_REAL_IS_HALF static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L) { #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor) @@ -74,10 +73,7 @@ static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L) return 1; #undef STRINGIFY_TENSOR } -#endif - -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L) { THTensor *tensor = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Tensor)); @@ -122,9 +118,7 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L) lua_settop(L, 1); return 1; } -#endif -#ifndef THC_REAL_IS_HALF static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L) { #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor) @@ -139,9 +133,6 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L) return 1; #undef STRINGIFY_TENSOR } -#endif - - #ifdef THC_REAL_IS_FLOAT static void THFloatTensor_computesz(THFloatTensor *self, long **sz_, long **st_) @@ -248,7 +239,6 @@ void cutorch_Tensor_(init)(lua_State* L) lua_pop(L, 1); #endif -#ifndef THC_REAL_IS_HALF luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Tensor)); lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Tensor_copy)); lua_setfield(L, -2, "copy"); @@ -264,7 +254,6 @@ void cutorch_Tensor_(init)(lua_State* L) lua_pushcfunction(L, cutorch_Tensor_(copyAsyncCPU)); lua_setfield(L, -2, "copyAsync"); lua_pop(L, 1); -#endif luaT_pushmetatable(L, torch_Tensor); lua_pushcfunction(L, cutorch_Tensor_(copy)); diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index b0813450..c82e4326 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -174,7 +174,7 @@ IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor") LIST(APPEND src-cuda THCHalf.cu) LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1") - SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h index ac592162..501239d7 100644 --- a/lib/THC/THCGenerateHalfType.h +++ b/lib/THC/THCGenerateHalfType.h @@ -5,7 +5,6 @@ #include "THCHalf.h" #ifdef CUDA_HALF_TENSOR - #define real half #define accreal float #define Real Half diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 96397ffb..c574bf8d 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -32,92 +32,17 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { float THC_half2float(half a) { - unsigned int bits = a.x & 0x7fff; - unsigned int sign = a.x & 0x8000; - unsigned int exp = a.x & 0x7c00; - - bits <<= 13; - sign <<= 16; - - bits += 0x38000000U; - - // flush denormals to 0 - bits = (exp == 0 ? 0 : bits) | sign; - - union { - float f; - unsigned int v; - } conv; - conv.v = bits; - - return conv.f; + TH_half h; + h.x = a.x; + return TH_half2float(h); } -/* - Copyright (c) 2015, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - half THC_float2half(float a) { - uint32_t ia; - uint16_t ir; - memcpy(&ia, &a, sizeof(float)); - - ir = (ia >> 16) & 0x8000; - if ((ia & 0x7f800000) == 0x7f800000) { - if ((ia & 0x7fffffff) == 0x7f800000) { - ir |= 0x7c00; /* infinity */ - } else { - ir = 0x7fff; /* canonical NaN */ - } - } else if ((ia & 0x7f800000) >= 0x33000000) { - int shift = (int)((ia >> 23) & 0xff) - 127; - if (shift > 15) { - ir |= 0x7c00; /* infinity */ - } else { - ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */ - if (shift < -14) { /* denormal */ - ir |= ia >> (-1 - shift); - ia = ia << (32 - (-1 - shift)); - } else { /* normal */ - ir |= ia >> (24 - 11); - ia = ia << (32 - (24 - 11)); - ir = ir + ((14 + shift) << 10); - } - /* IEEE-754 round to nearest of even */ - if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) { - ir++; - } - } - } - half ret; - memcpy(&ret, &ir, sizeof(half)); - return ret; + TH_half th_res = TH_float2half(a); + ret.x = th_res.x ; + return ret ; } THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 7c055e7a..dc77584b 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -8,9 +8,8 @@ #define CUDA_HALF_TENSOR 1 #endif -#ifdef CUDA_HALF_TENSOR +#include "THHalf.h" -#include #include THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); @@ -24,6 +23,8 @@ THC_API int THC_nativeHalfInstructions(THCState *state); /* Check for performant native fp16 support on the current device */ THC_API int THC_fastHalfInstructions(THCState *state); -#endif /* CUDA_HALF_TENSOR */ +# undef TH_GENERIC_USE_HALF +# define TH_GENERIC_USE_HALF 1 + #endif diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c index 669efa82..6fc9574e 100644 --- a/lib/THC/THCStorage.c +++ b/lib/THC/THCStorage.c @@ -2,7 +2,5 @@ #include "THCGeneral.h" #include "THAtomic.h" -#include "THCHalf.h" - #include "generic/THCStorage.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h index ac1cd709..025a3187 100644 --- a/lib/THC/THCStorage.h +++ b/lib/THC/THCStorage.h @@ -1,8 +1,8 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC +#include "THCHalf.h" #include "THStorage.h" -#include "THCGeneral.h" #define THCStorage TH_CONCAT_3(TH,CReal,Storage) #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 81051f75..25472960 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -71,7 +71,7 @@ TENSOR_UTILS(THCudaCharTensor, char, long); TENSOR_UTILS(THCudaShortTensor, short, long); TENSOR_UTILS(THCudaIntTensor, int, long); TENSOR_UTILS(THCudaLongTensor, long, long); -TENSOR_UTILS(THCudaTensor, float, float); +TENSOR_UTILS(THCudaTensor, float, double); TENSOR_UTILS(THCudaDoubleTensor, double, double); #ifdef CUDA_HALF_TENSOR diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c index af5dbcc2..bf7a628f 100644 --- a/lib/THC/generic/THCStorageCopy.c +++ b/lib/THC/generic/THCStorageCopy.c @@ -2,40 +2,36 @@ #define THC_GENERIC_FILE "generic/THCStorageCopy.c" #else -#ifndef THC_REAL_IS_HALF void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { THArgCheck(self->size == src->size, 2, "size does not match"); THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyHostToDevice)); } -#endif #ifndef THC_REAL_IS_HALF -#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ - void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THCStorage_(copyCPU)(state, self, (THStorage*) src); /* cast just removes compiler warning */ \ - } else { \ - THStorage *buffer; \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ - buffer = THStorage_(newWithSize)(src->size); \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + THStorage *buffer= THStorage_(newWithSize)(src->size); \ THStorage_(copy##TYPEC)(buffer, src); \ THCStorage_(copyCPU)(state, self, buffer); \ - THStorage_(free)(buffer); \ - } \ - } + THStorage_(free)(buffer); #else +#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ + THCudaStorage_copy##TYPEC(state, buffer, src); \ + THCFloat2Half(state, self->data, buffer->data, src->size); \ + THCudaStorage_free(state, buffer); +#endif + #define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC) \ void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCStorage_(copyCPU)(state, self, (THStorage*) src); /* cast just removes compiler warning */ \ + } else { \ THArgCheck(self->size == src->size, 2, "size does not match"); \ - THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ - THCudaStorage_copy##TYPEC(state, buffer, src); \ - THCFloat2Half(state, self->data, buffer->data, src->size); \ - THCudaStorage_free(state, buffer); \ + TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC) \ + } \ } -#endif TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte) TH_CUDA_STORAGE_IMPLEMENT_COPY(Char) @@ -43,52 +39,57 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Short) TH_CUDA_STORAGE_IMPLEMENT_COPY(Int) TH_CUDA_STORAGE_IMPLEMENT_COPY(Long) TH_CUDA_STORAGE_IMPLEMENT_COPY(Float) +#ifdef CUDA_HALF_TENSOR +TH_CUDA_STORAGE_IMPLEMENT_COPY(Half) +#endif TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) -#ifndef THC_REAL_IS_HALF void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { THArgCheck(self->size == src->size, 2, "size does not match"); THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyDeviceToHost)); } -#endif #ifndef THC_REAL_IS_HALF -#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ - void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */ \ - } else { \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ THStorage *buffer; \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ buffer = THStorage_(newWithSize)(src->size); \ THStorage_(copyCuda)(state, buffer, src); \ TH_CONCAT_4(TH,TYPEC,Storage_copy,Real)(self, buffer); \ - THStorage_(free)(buffer); \ - } \ - } + THStorage_(free)(buffer); #else -#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ - void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ - { \ - THArgCheck(self->size == src->size, 2, "size does not match"); \ +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size);\ THCHalf2Float(state, buffer->data, src->data, src->size); \ TH_CONCAT_3(TH,TYPEC,Storage_copyCudaFloat)(state, self, buffer); \ - THCudaStorage_free(state, buffer); \ - } + THCudaStorage_free(state, buffer); #endif +#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ + void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ + { \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */ \ + } else { \ + THArgCheck(self->size == src->size, 2, "size does not match"); \ + TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC) \ + } \ + } + TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float) +#ifdef CUDA_HALF_TENSOR +TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half) +#endif TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double) #undef TH_CUDA_STORAGE_IMPLEMENT_COPY #undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO +#undef TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX +#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX #endif diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu index 298f7179..1b224c43 100644 --- a/lib/THC/generic/THCStorageCopy.cu +++ b/lib/THC/generic/THCStorageCopy.cu @@ -43,14 +43,18 @@ void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src) void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ THArgCheck(self->size == src->size, 2, "size does not match"); \ - if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ - THCFloat2Half(state, self->data, (float*) src->data, src->size); /* cast removes compiler error */ \ - } else { \ - THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ - THCudaStorage_copyCuda##TYPEC(state, buffer, src); \ - THCFloat2Half(state, self->data, buffer->data, buffer->size); \ - THCudaStorage_free(state, buffer); \ - } \ + if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ + THCStorage_(copy)(state, self, (THCStorage*) src); /* cast just removes compiler warning */ \ + } else { \ + if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ + THCFloat2Half(state, self->data, (float*) src->data, src->size); /* cast removes compiler error */ \ + } else { \ + THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \ + THCudaStorage_copyCuda##TYPEC(state, buffer, src); \ + THCFloat2Half(state, self->data, buffer->data, buffer->size); \ + THCudaStorage_free(state, buffer); \ + } \ + } \ } #endif @@ -62,7 +66,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) -#ifdef CUDA_HALF_TENSOR +#if defined (CUDA_HALF_TENSOR) #define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat) void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src) { @@ -76,7 +80,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH THCudaStorage_free(state, buffer); } } -#undef FLOAT_COPY +# undef FLOAT_COPY #endif // CUDA_HALF_TENSOR #undef THC_CUDA_STORAGE_IMPLEMENT_COPY diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index c3e56013..7f1ccb08 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -13,6 +13,7 @@ THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct T THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src); THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src); THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src); +THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src); THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src); THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src); @@ -21,7 +22,8 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#ifdef CUDA_HALF_TENSOR + +#if 0 /* def CUDA_HALF_TENSOR */ THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif @@ -32,12 +34,12 @@ THC_API void TH_CONCAT_2(THIntStorage_copyCuda , Real)(THCState *state, THIntS THC_API void TH_CONCAT_2(THLongStorage_copyCuda , Real)(THCState *state, THLongStorage *self, struct THCStorage *src); THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src); THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src); +#ifdef CUDA_HALF_TENSOR +THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src); +#endif -/* There is no THHalfStorage */ -#ifndef THC_REAL_IS_HALF THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src); THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src); THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src); -#endif #endif diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c index d7dcd53b..53b082c0 100644 --- a/torch/generic/Tensor.c +++ b/torch/generic/Tensor.c @@ -143,7 +143,7 @@ static int torch_Tensor_(new)(lua_State *L) luaL_error(L, "invalid element (not a number)"); } -#ifdef THC_REAL_IS_HALF +#ifndef THC_HALF half value = THC_float2half((float) lua_tonumber(L, -1)); #else real value = (real) lua_tonumber(L, -1); From 10ef05657e6f800127797e620c1ad2ced596d33e Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sat, 29 Oct 2016 15:54:48 -0700 Subject: [PATCH 06/13] Using half from TH --- generic/CStorage.c | 2 -- generic/CTensor.c | 2 -- init.c | 5 ++--- lib/THC/THCAtomics.cuh | 2 +- lib/THC/THCBlas.cu | 2 -- lib/THC/THCBlas.h | 1 - lib/THC/THCGeneral.h.in | 10 ++++------ lib/THC/THCGenerateHalfType.h | 2 +- lib/THC/THCHalf.cu | 17 +---------------- lib/THC/THCHalf.h | 11 +++++++---- lib/THC/THCNumerics.cuh | 3 ++- lib/THC/THCStorage.cu | 2 -- lib/THC/THCStorage.h | 3 ++- lib/THC/THCStorageCopy.c | 3 --- lib/THC/THCStorageCopy.cu | 3 +-- lib/THC/THCStorageCopy.h | 1 - lib/THC/THCTensor.h | 3 ++- lib/THC/THCTensorCopy.c | 4 +--- lib/THC/THCTensorCopy.cu | 1 - lib/THC/THCTensorCopy.h | 1 - lib/THC/THCTensorIndex.cu | 6 +++--- lib/THC/THCTensorMath.h | 1 - lib/THC/THCTensorMathPairwise.cu | 3 +-- lib/THC/THCTensorMathPointwise.cuh | 3 +-- lib/THC/THCTensorTypeUtils.cu | 1 - lib/THC/THCTensorTypeUtils.cuh | 1 - lib/THC/generic/THCStorageCopy.h | 2 +- lib/THC/generic/THCTensorCopy.h | 9 ++++++--- test/test.lua | 4 ++-- torch/generic/Tensor.c | 4 +--- torch/utils.h | 2 ++ 31 files changed, 41 insertions(+), 73 deletions(-) diff --git a/generic/CStorage.c b/generic/CStorage.c index d6d4db57..9be795fd 100644 --- a/generic/CStorage.c +++ b/generic/CStorage.c @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/CStorage.c" #else -#include "THCHalf.h" - /* everything is as the generic Storage.c, except few things (see below) */ #define THFile_readRealRaw(file, data, size) \ diff --git a/generic/CTensor.c b/generic/CTensor.c index 64ac76a9..e22ba59d 100644 --- a/generic/CTensor.c +++ b/generic/CTensor.c @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/CTensor.c" #else -#include "THCHalf.h" - /* everything is as the generic Storage.c, except few things (see below) */ #define TH_GENERIC_FILE "generic/Tensor.c" diff --git a/init.c b/init.c index d424aa6e..07ad51be 100644 --- a/init.c +++ b/init.c @@ -3,7 +3,6 @@ #include "THCGeneral.h" #include "THCCachingAllocator.h" #include "THCTensorRandom.h" -#include "THCHalf.h" // for CUDA_HALF_TENSOR extern void cutorch_CudaByteStorage_init(lua_State* L); extern void cutorch_CudaCharStorage_init(lua_State* L); @@ -911,11 +910,11 @@ static int cutorch_hasHalfInstructions(lua_State *L) { static int cutorch_hasFastHalfInstructions(lua_State *L) { THCState *state = cutorch_getstate(L); -#ifdef CUDA_HALF_TENSOR +#ifdef CUDA_HALF_TENSOR lua_pushboolean(L, THC_fastHalfInstructions(state)); #else lua_pushboolean(L, 0); -#endif +#endif return 1; } diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh index 42291144..31274e2c 100644 --- a/lib/THC/THCAtomics.cuh +++ b/lib/THC/THCAtomics.cuh @@ -1,7 +1,7 @@ #ifndef THC_ATOMICS_INC #define THC_ATOMICS_INC -#include "THCHalf.h" +#include "THCGeneral.h" template struct AtomicAddIntegerImpl; diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu index e3462025..26ab4398 100644 --- a/lib/THC/THCBlas.cu +++ b/lib/THC/THCBlas.cu @@ -1,6 +1,4 @@ #include "THCBlas.h" -#include "THCGeneral.h" -#include "THCHalf.h" float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy) { diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h index 45f58eba..5a0c58d6 100644 --- a/lib/THC/THCBlas.h +++ b/lib/THC/THCBlas.h @@ -2,7 +2,6 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy); diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index c50cc1c7..a08bf5a1 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -6,12 +6,6 @@ #include "THCThreadLocal.h" #undef log1p -#include "cuda.h" -#include "cuda_runtime.h" -#include "cublas_v2.h" - -#cmakedefine USE_MAGMA - #ifdef __cplusplus # define THC_EXTERNC extern "C" #else @@ -44,6 +38,10 @@ struct THCRNGState; /* Random number generator state. */ typedef struct THCStream THCStream; typedef struct THCState THCState; +#include "THCHalf.h" + +#cmakedefine USE_MAGMA + typedef struct _THCDeviceAllocator { cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t); cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t); diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h index 501239d7..6f46e7f0 100644 --- a/lib/THC/THCGenerateHalfType.h +++ b/lib/THC/THCGenerateHalfType.h @@ -2,7 +2,7 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "THCGeneral.h" #ifdef CUDA_HALF_TENSOR #define real half diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index c574bf8d..bb19d498 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -1,4 +1,4 @@ -#include "THCHalf.h" +#include "THCGeneral.h" #include #include @@ -30,21 +30,6 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { in, in + len, out, __half2floatOp()); } -float THC_half2float(half a) -{ - TH_half h; - h.x = a.x; - return TH_half2float(h); -} - -half THC_float2half(float a) -{ - half ret; - TH_half th_res = TH_float2half(a); - ret.x = th_res.x ; - return ret ; -} - THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index dc77584b..01b0a692 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -1,11 +1,14 @@ #ifndef THC_HALF_CONVERSION_INC #define THC_HALF_CONVERSION_INC -#include "THCGeneral.h" +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" +#include "cuda_fp16.h" /* We compile with CudaHalfTensor support if we have this: */ #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 -#define CUDA_HALF_TENSOR 1 +# define CUDA_HALF_TENSOR 1 #endif #include "THHalf.h" @@ -14,8 +17,8 @@ THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); +# define THC_float2half(a) TH_float2half(a) +# define THC_half2float(a) TH_half2float(a) /* Check for native fp16 support on the current device (CC 5.3+) */ THC_API int THC_nativeHalfInstructions(THCState *state); diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 09443605..4765048a 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -3,7 +3,8 @@ #include #include -#include "THCHalf.h" + +#include "THCGeneral.h" /// Class for numeric limits of the particular data type, which /// includes support for `half`. diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu index a23794c2..2ceb0c7f 100644 --- a/lib/THC/THCStorage.cu +++ b/lib/THC/THCStorage.cu @@ -6,7 +6,5 @@ #include #endif -#include "THCHalf.h" - #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h index 025a3187..6a27fa34 100644 --- a/lib/THC/THCStorage.h +++ b/lib/THC/THCStorage.h @@ -1,7 +1,8 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC -#include "THCHalf.h" +#include "THCGeneral.h" + #include "THStorage.h" #define THCStorage TH_CONCAT_3(TH,CReal,Storage) diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c index cf2bf8a6..721da7fe 100644 --- a/lib/THC/THCStorageCopy.c +++ b/lib/THC/THCStorageCopy.c @@ -1,7 +1,4 @@ #include "THCStorageCopy.h" -#include "THCGeneral.h" - -#include "THCHalf.h" #include "generic/THCStorageCopy.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu index 18496863..b245d90e 100644 --- a/lib/THC/THCStorageCopy.cu +++ b/lib/THC/THCStorageCopy.cu @@ -1,7 +1,6 @@ -#include "THCStorageCopy.h" #include "THCGeneral.h" +#include "THCStorageCopy.h" -#include "THCHalf.h" #include "generic/THCStorageCopy.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h index 837056fc..ec8011d3 100644 --- a/lib/THC/THCStorageCopy.h +++ b/lib/THC/THCStorageCopy.h @@ -3,7 +3,6 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h index d4eb49a3..8a3ab9ff 100644 --- a/lib/THC/THCTensor.h +++ b/lib/THC/THCTensor.h @@ -1,9 +1,10 @@ #ifndef THC_TENSOR_INC #define THC_TENSOR_INC +#include "THCGeneral.h" #include "THTensor.h" #include "THCStorage.h" -#include "THCGeneral.h" + #define THCTensor TH_CONCAT_3(TH,CReal,Tensor) #define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME) diff --git a/lib/THC/THCTensorCopy.c b/lib/THC/THCTensorCopy.c index 1bf8980d..9030ab6a 100644 --- a/lib/THC/THCTensorCopy.c +++ b/lib/THC/THCTensorCopy.c @@ -1,8 +1,6 @@ -#include "THCTensorCopy.h" #include "THCGeneral.h" #include "THCTensor.h" - -#include "THCHalf.h" +#include "THCTensorCopy.h" #include "generic/THCTensorCopy.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index 8889939f..bb6425c5 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -1,5 +1,4 @@ #include "THCApply.cuh" -#include "THCHalf.h" #include "THCNumerics.cuh" inline int curGPU() { diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h index e8bc4f4b..fc206cb7 100644 --- a/lib/THC/THCTensorCopy.h +++ b/lib/THC/THCTensorCopy.h @@ -3,7 +3,6 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" #include "generic/THCTensorCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index 415e6256..1fe3683f 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -1,10 +1,10 @@ -#include "THC.h" -#include "THCTensorMath.h" #include "THCGeneral.h" + +#include "THCTensorMath.h" #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" + #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index 3d714692..fd4d0ae0 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -2,7 +2,6 @@ #define TH_CUDA_TENSOR_MATH_INC #include "THCTensor.h" -#include "THCGeneral.h" #include "generic/THCTensorMath.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 2695f2df..c482ae78 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,6 +1,5 @@ -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index c52e0827..cd9055a7 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -1,9 +1,8 @@ #ifndef THC_TENSORMATH_POINTWISE_CUH #define THC_TENSORMATH_POINTWISE_CUH -#include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "THCTensorMath.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu index a273a728..d3019a07 100644 --- a/lib/THC/THCTensorTypeUtils.cu +++ b/lib/THC/THCTensorTypeUtils.cu @@ -1,7 +1,6 @@ #include "THCTensorTypeUtils.cuh" #include "THCTensor.h" #include "THCTensorCopy.h" -#include "THCHalf.h" #include namespace { diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 25472960..ce8f21ac 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,6 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" #include "THCTensor.h" #include "THCTensorInfo.cuh" diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index 7f1ccb08..c930a0f5 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -23,7 +23,7 @@ THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, str THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#if 0 /* def CUDA_HALF_TENSOR */ +#ifdef CUDA_HALF_TENSOR THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h index 71d878d7..0bec3790 100644 --- a/lib/THC/generic/THCTensorCopy.h +++ b/lib/THC/generic/THCTensorCopy.h @@ -11,6 +11,9 @@ THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor * THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src); THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src); THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src); +#endif THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src); THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src); @@ -30,15 +33,15 @@ THC_API void TH_CONCAT_2(THIntTensor_copyCuda , Real) (THCState *state, THInt THC_API void TH_CONCAT_2(THLongTensor_copyCuda , Real) (THCState *state, THLongTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real) (THCState *state, THFloatTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +#ifdef CUDA_HALF_TENSOR +THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); +#endif THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src); -/* There is no THHalfTensor */ -#ifndef THC_REAL_IS_HALF THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src); THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src); THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src); THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src); -#endif #endif diff --git a/test/test.lua b/test/test.lua index 058103d9..6525de95 100644 --- a/test/test.lua +++ b/test/test.lua @@ -2483,7 +2483,7 @@ function test.logNormal() local sz1 = chooseInt(minsize, maxsize) local sz2 = chooseInt(minsize, maxsize) local mean, std = torch.uniform(), 0.1 * torch.uniform() - local tolerance = 0.01 + local tolerance = 0.02 local t = torch.CudaTensor(sz1, sz2) t:logNormal(mean, std) @@ -3261,7 +3261,7 @@ function test.cat() end function test.catArray() - for k, typename in ipairs(typenames) do + for k, typename in ipairs(typenames) do for dim = 1, 3 do local x = torch.Tensor(13, minsize, minsize):uniform() :type(typename):transpose(1, dim) diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c index 53b082c0..5e02ab1b 100644 --- a/torch/generic/Tensor.c +++ b/torch/generic/Tensor.c @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/Tensor.c" #else -#include "THCHalf.h" - static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride, THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_); @@ -143,7 +141,7 @@ static int torch_Tensor_(new)(lua_State *L) luaL_error(L, "invalid element (not a number)"); } -#ifndef THC_HALF +#ifdef THC_REAL_IS_HALF half value = THC_float2half((float) lua_tonumber(L, -1)); #else real value = (real) lua_tonumber(L, -1); diff --git a/torch/utils.h b/torch/utils.h index ae959b73..74d68c66 100644 --- a/torch/utils.h +++ b/torch/utils.h @@ -1,6 +1,8 @@ #ifndef CUTORCH_UTILS_INC #define CUTORCH_UTILS_INC +#include "THCGeneral.h" + #include "luaT.h" #include "TH.h" From 838ec700c17cb9f2b942c8110184fa1eef4c1301 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 15 Nov 2016 02:29:53 -0800 Subject: [PATCH 07/13] Implemented cudaMemGetInfo for caching allocator --- init.c | 31 ++++++++++++++++++--------- lib/THC/THCCachingAllocator.cpp | 37 +++++++++++++++++++++++++++++---- lib/THC/THCGeneral.c | 28 +++++++++++++++++++++++++ lib/THC/THCGeneral.h.in | 2 ++ 4 files changed, 84 insertions(+), 14 deletions(-) diff --git a/init.c b/init.c index 69f5583a..f7c97d36 100644 --- a/init.c +++ b/init.c @@ -694,19 +694,31 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L) } static int cutorch_getMemoryUsage(lua_State *L) { - size_t freeBytes = 0; size_t totalBytes = 0; - int curDevice; - THCudaCheck(cudaGetDevice(&curDevice)); + size_t freeBytes = 0; + + THCState *state = cutorch_getstate(L); int device = luaL_optint(L, 1, -10); - if (device == -10) { /* no argument passed, current device mem usage */ - THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes)); - } else { /* argument was given, particular device's memory usage */ - THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */ - THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes)); - THCudaCheck(cudaSetDevice(curDevice)); + if (device != -10) { /* no argument passed, current device mem usage */ + --device; } + + int prevDevice, curDevice = -10; + THCudaCheck(cudaGetDevice(&prevDevice)); + + if (device != -10) { /* no argument passed, current device mem usage */ + curDevice = device; /* zero indexed */ + if (curDevice != prevDevice) + THCudaCheck(cudaSetDevice(curDevice)); + } + + THCudaCheck(THCudaMemGetInfo(state, &totalBytes, &freeBytes)); + + if (curDevice != prevDevice) { /* restore current device if we have changed it */ + THCudaCheck(cudaSetDevice(prevDevice)); + } + lua_pushnumber(L, freeBytes); lua_pushnumber(L, totalBytes); return 2; @@ -714,7 +726,6 @@ static int cutorch_getMemoryUsage(lua_State *L) { static int cutorch_setDevice(lua_State *L) { - THCState *state = cutorch_getstate(L); int device = (int)luaL_checknumber(L, 1)-1; THCudaCheck(cudaSetDevice(device)); return 0; diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index e2fc8d85..ab9528e1 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -158,12 +158,12 @@ struct THCCachingAllocator allocated_blocks.erase(it); bool small = block->size <= kSmallAlloc; - auto& free_blocks = small ? large_blocks : small_blocks; - try_merge_blocks(block, block->prev, free_blocks); - try_merge_blocks(block, block->next, free_blocks); + auto& cur_free_blocks = small ? large_blocks : small_blocks; + try_merge_blocks(block, block->prev, cur_free_blocks); + try_merge_blocks(block, block->next, cur_free_blocks); block->allocated = false; - free_blocks.insert(block); + cur_free_blocks.insert(block); return cudaSuccess; } @@ -205,6 +205,27 @@ struct THCCachingAllocator return basePtr; } + // Accumulates sizes of all memory blocks for given device in given free list + void cacheInfoAux(FreeBlocks& blocks, int dev_id, size_t* total, size_t* largest) + { + Block search_key(dev_id, 0, 0); + auto it = blocks.lower_bound(&search_key); + for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) { + size_t blocksize = (*it)->size; + total += blocksize; + if (blocksize > *largest) + *largest = blocksize; + } + } + + void cacheInfo(int dev_id, size_t* total, size_t* largest) + { + std::lock_guard lock(mutex); + cacheInfoAux(large_blocks, dev_id, total, largest); + cacheInfoAux(small_blocks, dev_id, total, largest); + } + + /** combine previously split blocks */ void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks) { @@ -327,12 +348,20 @@ static cudaError_t THCCachingAllocator_emptyCache(void* ctx) return a->emptyCache(); } +static cudaError_t THCCachingAllocator_cacheInfo(void* ctx, int dev_id, size_t* totalCached, size_t* largestBlock) +{ + THCCachingAllocator* a = (THCCachingAllocator*) ctx; + a->cacheInfo(dev_id, totalCached, largestBlock); + return cudaSuccess; +} + static THCCachingAllocator caching_allocator; static THCDeviceAllocator device_allocator = { &THCCachingAllocator_malloc, NULL, &THCCachingAllocator_free, &THCCachingAllocator_emptyCache, + &THCCachingAllocator_cacheInfo, &caching_allocator }; diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 13f62be9..403c4fa6 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -41,6 +41,7 @@ static THCDeviceAllocator defaultDeviceAllocator = { NULL, &cudaFreeWrapper, NULL, + NULL, NULL }; @@ -710,6 +711,33 @@ cudaError_t THCudaFree(THCState *state, void *ptr) return allocator->free(allocator->state, ptr); } +cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes) +{ + size_t cachedBytes = 0; + size_t largestBlock = 0; + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; + + /* get info from CUDA first */ + cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes); + if (ret!= cudaSuccess) + return ret; + + int device; + ret = cudaGetDevice(&device); + if (ret!= cudaSuccess) + return ret; + + /* not always true - our optimistic guess here */ + largestBlock = *freeBytes; + + if (allocator->cacheInfo != NULL) + allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock); + + /* Adjust resulting free bytes number. largesBlock unused for now */ + *freeBytes += cachedBytes; + return cudaSuccess; +} + static ptrdiff_t applyHeapDelta(THCState *state) { ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta; state->heapDelta = 0; diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 8f55cf3f..c685d373 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -49,6 +49,7 @@ typedef struct _THCDeviceAllocator { cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t); cudaError_t (*free)(void*, void*); cudaError_t (*emptyCache)(void*); + cudaError_t (*cacheInfo)(void*, int, size_t*, size_t*); void* state; } THCDeviceAllocator; @@ -177,6 +178,7 @@ THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size); THC_API cudaError_t THCudaFree(THCState *state, void *ptr); +THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes); THC_API void THCSetGCHandler(THCState *state, void (*torchGCHandlerFunction)(void *data), void *data ); From 9aef731946f687e08c2ba4438ab985c433e3044f Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 15 Nov 2016 02:46:49 -0800 Subject: [PATCH 08/13] Added some memory allocations to test_shutdown to test acching allocator --- test/test_shutdown.lua | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/test/test_shutdown.lua b/test/test_shutdown.lua index e78a51e2..a5221642 100644 --- a/test/test_shutdown.lua +++ b/test/test_shutdown.lua @@ -1,11 +1,36 @@ local Threads = require 'threads' require 'cutorch' +local function test_cudaEvent() + cutorch.reserveStreams(2) + cutorch.setStream(1) + + local t1 = torch.CudaTensor(10000000):zero() + local t2 = torch.CudaTensor(1):zero() + + local t1View = t1:narrow(1, 10000000, 1) + t1:fill(1) + print('Memory usage after some allocations [free memory], [total memory]') + print(cutorch.getMemoryUsage()) + + -- Event is created here + local event = cutorch.Event() + + cutorch.setStream(2) + + -- assert below will fail without this + event:waitOn() + t2:copy(t1View) + + -- revert to default stream + cutorch.setStream(0) +end + print ("cutorch.hasHalf is ", cutorch.hasHalf) print('Memory usage before intialization of threads [free memory], [total memory]') print(cutorch.getMemoryUsage()) -threads = Threads(100, function() require 'cutorch' end) +threads = Threads(100, function() require 'cutorch'; test_cudaEvent(); end) print('Memory usage after intialization of threads [free memory], [total memory]') print(cutorch.getMemoryUsage()) threads:terminate() From 9a6ba41117663fcd670a2f2b412383dc5d0aa291 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 15 Nov 2016 14:48:52 -0800 Subject: [PATCH 09/13] Bugfix, test extended --- lib/THC/THCCachingAllocator.cpp | 2 +- lib/THC/THCGeneral.c | 2 +- test/test_shutdown.lua | 40 +++++++++++++++++++++++++++------ 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index ab9528e1..a4978fb8 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -212,7 +212,7 @@ struct THCCachingAllocator auto it = blocks.lower_bound(&search_key); for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) { size_t blocksize = (*it)->size; - total += blocksize; + *total += blocksize; if (blocksize > *largest) *largest = blocksize; } diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 403c4fa6..547e060e 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -732,7 +732,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB if (allocator->cacheInfo != NULL) allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock); - + /* Adjust resulting free bytes number. largesBlock unused for now */ *freeBytes += cachedBytes; return cudaSuccess; diff --git a/test/test_shutdown.lua b/test/test_shutdown.lua index a5221642..e48a058d 100644 --- a/test/test_shutdown.lua +++ b/test/test_shutdown.lua @@ -10,8 +10,6 @@ local function test_cudaEvent() local t1View = t1:narrow(1, 10000000, 1) t1:fill(1) - print('Memory usage after some allocations [free memory], [total memory]') - print(cutorch.getMemoryUsage()) -- Event is created here local event = cutorch.Event() @@ -26,13 +24,41 @@ local function test_cudaEvent() cutorch.setStream(0) end -print ("cutorch.hasHalf is ", cutorch.hasHalf) +local Gig = 1024*1024*1024 + +local function test_getMemInfo() + local sz = Gig*0.1 + local t1 = torch.CudaTensor(sz):zero() + print('Memory usage after 1st allocation [free memory], [total memory]') + local total, free = cutorch.getMemoryUsage() + print(free/Gig, total/Gig) + local t2 = torch.CudaTensor(sz*1.3):zero() + print('Memory usage after 2nd allocation [free memory], [total memory]') + local total, free = cutorch.getMemoryUsage() + print(free/Gig, total/Gig) + t1 = nil + collectgarbage() + print('Memory usage after 1st deallocation [free memory], [total memory]') + local total, free = cutorch.getMemoryUsage() + print(free/Gig, total/Gig) + t2 = nil + collectgarbage() + print('Memory usage after 2nd deallocation [free memory], [total memory]') + total, free = cutorch.getMemoryUsage() + print(free/Gig, total/Gig) +end +print ("cutorch.hasHalf is ", cutorch.hasHalf) print('Memory usage before intialization of threads [free memory], [total memory]') -print(cutorch.getMemoryUsage()) -threads = Threads(100, function() require 'cutorch'; test_cudaEvent(); end) +local total, free = cutorch.getMemoryUsage() +print(free/Gig, total/Gig) +threads = Threads(20, function() require 'cutorch'; test_getMemInfo(); test_cudaEvent(); end) print('Memory usage after intialization of threads [free memory], [total memory]') -print(cutorch.getMemoryUsage()) +total, free = cutorch.getMemoryUsage() +print(free/Gig, total/Gig) threads:terminate() +collectgarbage() print('Memory usage after termination of threads [free memory], [total memory]') -print(cutorch.getMemoryUsage()) +total, free = cutorch.getMemoryUsage() +print(free/Gig, total/Gig) + From 287689458df744e707b321bb72d6ff2635948966 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 17 Nov 2016 03:08:21 -0800 Subject: [PATCH 10/13] Added C++ flags --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3698e68..1ed12563 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAG ENDIF (NOT WIN32) IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_CXX_FLAGS}") ENDIF() INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) From 0114d65f649e0eb528bf45691ddd4476c4a4db8b Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 15 Jan 2017 02:16:14 -0800 Subject: [PATCH 11/13] Build working --- init.c | 27 +--- lib/THC/CMakeLists.txt | 3 +- lib/THC/THCAtomics.cuh | 6 +- lib/THC/THCBlas.cu | 2 + lib/THC/THCBlas.h | 1 + lib/THC/THCCachingAllocator.cpp | 8 +- lib/THC/THCGeneral.c | 4 - lib/THC/THCGeneral.h.in | 10 +- lib/THC/THCGenerateHalfType.h | 2 +- lib/THC/THCHalf.cu | 1 - lib/THC/THCHalf.h | 30 ++-- lib/THC/THCNumerics.cuh | 178 ++++++++++++---------- lib/THC/THCStorage.c | 2 + lib/THC/THCStorage.cu | 2 + lib/THC/THCStorage.h | 3 +- lib/THC/THCStorageCopy.c | 1 + lib/THC/THCStorageCopy.cu | 2 +- lib/THC/THCStorageCopy.h | 1 + lib/THC/THCTensor.h | 3 +- lib/THC/THCTensorCopy.cu | 1 + lib/THC/THCTensorCopy.h | 1 + lib/THC/THCTensorIndex.cu | 6 +- lib/THC/THCTensorMath.h | 1 + lib/THC/THCTensorMathPairwise.cu | 4 +- lib/THC/THCTensorMathPointwise.cuh | 47 +----- lib/THC/THCTensorTypeUtils.cu | 1 + lib/THC/THCTensorTypeUtils.cuh | 54 ++++++- lib/THC/generic/THCStorageCopy.c | 4 +- lib/THC/generic/THCStorageCopy.h | 1 - lib/THC/generic/THCTensorCopy.h | 1 - lib/THC/generic/THCTensorMathBlas.cu | 2 +- lib/THC/generic/THCTensorMathPairwise.cu | 2 +- lib/THC/generic/THCTensorMathPointwise.cu | 4 +- lib/THC/generic/THCTensorMathReduce.cu | 8 +- 34 files changed, 215 insertions(+), 208 deletions(-) diff --git a/init.c b/init.c index 8f4eb452..1e2b6c49 100644 --- a/init.c +++ b/init.c @@ -699,16 +699,8 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L) } static int cutorch_getMemoryUsage(lua_State *L) { - size_t totalBytes = 0; -<<<<<<< HEAD size_t freeBytes = 0; - - THCState *state = cutorch_getstate(L); - - int device = luaL_optint(L, 1, -10); - if (device != -10) { /* no argument passed, current device mem usage */ - --device; -======= + size_t totalBytes = 0; int curDevice; THCudaCheck(cudaGetDevice(&curDevice)); THCState *state = cutorch_getstate(L); @@ -720,24 +712,7 @@ static int cutorch_getMemoryUsage(lua_State *L) { THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */ THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes)); THCudaCheck(cudaSetDevice(curDevice)); ->>>>>>> upstream/master } - - int prevDevice, curDevice = -10; - THCudaCheck(cudaGetDevice(&prevDevice)); - - if (device != -10) { /* no argument passed, current device mem usage */ - curDevice = device; /* zero indexed */ - if (curDevice != prevDevice) - THCudaCheck(cudaSetDevice(curDevice)); - } - - THCudaCheck(THCudaMemGetInfo(state, &totalBytes, &freeBytes)); - - if (curDevice != prevDevice) { /* restore current device if we have changed it */ - THCudaCheck(cudaSetDevice(prevDevice)); - } - lua_pushnumber(L, freeBytes); lua_pushnumber(L, totalBytes); return 2; diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index d5271098..a18451af 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -190,9 +190,8 @@ MESSAGE(STATUS "got cuda version " ${CUDA_VERSION}) IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor") LIST(APPEND src-cuda THCHalf.cu) - LIST(APPEND src-cuda THCHalf.c) LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1") - SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}") + SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh index d03d1740..ac0b45f6 100644 --- a/lib/THC/THCAtomics.cuh +++ b/lib/THC/THCAtomics.cuh @@ -1,7 +1,7 @@ #ifndef THC_ATOMICS_INC #define THC_ATOMICS_INC -#include "THCGeneral.h" +#include "THCHalf.h" template struct AtomicAddIntegerImpl; @@ -98,12 +98,12 @@ static inline __device__ void atomicAdd(half *address, half val) { (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; unsigned int assumed; - typedef THCNumerics N_; + do { assumed = old; half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = N_::s_(N_::add(hsum, val)); + hsum = THCNumerics::add(hsum, val); old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu index 8675945e..c438ad8e 100644 --- a/lib/THC/THCBlas.cu +++ b/lib/THC/THCBlas.cu @@ -1,4 +1,6 @@ #include "THCBlas.h" +#include "THCGeneral.h" +#include "THCHalf.h" float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy) { diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h index bd9a3807..bf91f936 100644 --- a/lib/THC/THCBlas.h +++ b/lib/THC/THCBlas.h @@ -2,6 +2,7 @@ #define THC_BLAS_INC #include "THCGeneral.h" +#include "THCHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy); diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index 8b43ef2b..eeae04a2 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -158,12 +158,12 @@ struct THCCachingAllocator allocated_blocks.erase(it); bool small = block->size <= kSmallAlloc; - auto& cur_free_blocks = small ? large_blocks : small_blocks; - try_merge_blocks(block, block->prev, cur_free_blocks); - try_merge_blocks(block, block->next, cur_free_blocks); + auto& free_blocks = small ? large_blocks : small_blocks; + try_merge_blocks(block, block->prev, free_blocks); + try_merge_blocks(block, block->next, free_blocks); block->allocated = false; - cur_free_blocks.insert(block); + free_blocks.insert(block); return cudaSuccess; } diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index fa699aa7..c442bd87 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -718,11 +718,7 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB /* not always true - our optimistic guess here */ largestBlock = *freeBytes; -<<<<<<< HEAD - -======= ->>>>>>> upstream/master if (allocator->cacheInfo != NULL) allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock); diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index c4eeff39..a88bd7db 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -6,6 +6,12 @@ #include "THCThreadLocal.h" #undef log1p +#include "cuda.h" +#include "cuda_runtime.h" +#include "cublas_v2.h" + +#cmakedefine USE_MAGMA + #ifdef __cplusplus # define THC_EXTERNC extern "C" #else @@ -38,10 +44,6 @@ struct THCRNGState; /* Random number generator state. */ typedef struct THCStream THCStream; typedef struct THCState THCState; -#include "THCHalf.h" - -#cmakedefine USE_MAGMA - typedef struct _THCDeviceAllocator { cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t); cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t); diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h index 596c6b1e..77d4c0ad 100644 --- a/lib/THC/THCGenerateHalfType.h +++ b/lib/THC/THCGenerateHalfType.h @@ -2,7 +2,7 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCGeneral.h" +#include "THCHalf.h" #if defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF) diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index a9b468fe..023774e9 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -1,6 +1,5 @@ #include "THCHalf.h" #include "THCThrustAllocator.cuh" - #include #include diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 6a6d7304..5e35694e 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -13,9 +13,6 @@ #ifdef CUDA_HALF_TENSOR -# undef TH_GENERIC_USE_HALF -# define TH_GENERIC_USE_HALF 1 - #include "THCGeneral.h" #include "THHalf.h" @@ -23,8 +20,8 @@ THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); -# define THC_float2half(a) TH_float2half(a) -# define THC_half2float(a) TH_half2float(a) +THC_API half THC_float2half(float a); +THC_API float THC_half2float(half a); /* Check for native fp16 support on the current device (CC 5.3+) */ THC_API int THC_nativeHalfInstructions(THCState *state); @@ -33,30 +30,23 @@ THC_API int THC_nativeHalfInstructions(THCState *state); THC_API int THC_fastHalfInstructions(THCState *state); # if defined (__CUDA_ARCH__) - /* use instrintic functons defined for device only in cuda_fp16.h */ # define THC_FLOAT_TO_HALF(x) __float2half((float)x) # define THC_HALF_TO_FLOAT(x) __half2float(x) +# define THC_DECL __host__ __device__ __forceinline__ # else /* use host conversion functions */ # define THC_FLOAT_TO_HALF(x) THC_float2half((float)x) # define THC_HALF_TO_FLOAT(x) THC_half2float(x) +# define THC_DECL inline # endif -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) -# define CUDA_FP16_INSTRINTICS 1 +#if 0 // __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620 +# define CUDA_HALF_INSTRUCTIONS 1 #endif #if defined (__cplusplus__) || defined (__CUDACC__) -__host__ __device__ __forceinline__ bool operator==(const half& a, const half& b) { - return a.x == b.x; -} - -__host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b) { - return a.x != b.x; -} - /// `half` has some type conversion issues associated with it, since it /// is a struct without a constructor/implicit conversion constructor. /// We use this to convert scalar values to the given type that the @@ -64,26 +54,26 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b template struct ScalarConvert { - static inline __host__ __device__ Out to(const In& v) { return Out(v); } + static THC_DECL Out to(const In& v) { return Out(v); } }; template struct ScalarConvert { - static __host__ __device__ __forceinline__ Out to(const half& v) { + static THC_DECL Out to(const half& v) { return (Out) THC_HALF_TO_FLOAT(v); } }; template struct ScalarConvert { - static __host__ __device__ __forceinline__ half to(const In& v) { + static THC_DECL half to(const In& v) { return THC_FLOAT_TO_HALF(v); } }; template <> struct ScalarConvert { - static __host__ __device__ __forceinline__ const half& to(const half& v) { + static THC_DECL const half& to(const half& v) { return v; } }; diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index 1fab1927..a5d558c1 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -8,20 +8,20 @@ template struct THCNumConstants { - static __host__ __device__ const T one() { return T(1); } - static __host__ __device__ const T zero() { return T(0); } - static __host__ __device__ const T min() { return std::numeric_limits::min(); } - static __host__ __device__ const T max() { return std::numeric_limits::max(); } + static THC_DECL const T one() { return T(1); } + static THC_DECL const T zero() { return T(0); } + static THC_DECL const T min() { return std::numeric_limits::min(); } + static THC_DECL const T max() { return std::numeric_limits::max(); } }; template <> struct THCNumConstants { - static __host__ __device__ const half one() { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */ - static __host__ __device__ const half zero() { half ret; ret.x = 0; return ret;} - static __host__ __device__ const half min() { half ret; ret.x = 0x0400; return ret; } - static __host__ __device__ const half max() { half ret; ret.x = 0x7BFF; return ret; } + static THC_DECL const half one() { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */ + static THC_DECL const half zero() { half ret; ret.x = 0; return ret;} + static THC_DECL const half min() { half ret; ret.x = 0xFBFF; return ret; } + static THC_DECL const half max() { half ret; ret.x = 0x7BFF; return ret; } }; template @@ -35,29 +35,34 @@ struct THCNumCommonBase { /* type of math operation result , like (a*b). Usually == StorageType */ typedef T expr_type; - static __host__ __device__ __forceinline__ math_type m_(const storage_type& a) { + static THC_DECL math_type m_(const storage_type& a) { return ScalarConvert::to(a); } - static __host__ __device__ __forceinline__ expr_type e_(const math_type& a) { + static THC_DECL expr_type e_(const math_type& a) { return ScalarConvert::to(a); } - static __host__ __device__ __forceinline__ storage_type s_(const expr_type& a) { + static THC_DECL storage_type s_(const expr_type& a) { return ScalarConvert::to(a); } - static inline __host__ __device__ bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } - static inline __host__ __device__ bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } - static inline __host__ __device__ bool gt(const storage_type& a, const storage_type& b) { return m_(a) > m_(b); } - static inline __host__ __device__ bool ge(const storage_type& a, const storage_type& b) { return m_(a) >= m_(b); } - static inline __host__ __device__ bool eq(const storage_type& a, const storage_type& b) { return m_(a) == m_(b); } - static inline __host__ __device__ bool ne(const storage_type& a, const storage_type& b) { return m_(a) != m_(b); } - - static inline __host__ __device__ expr_type add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } - static inline __host__ __device__ expr_type mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } - static inline __host__ __device__ expr_type sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } - static inline __host__ __device__ expr_type div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } - static inline __host__ __device__ expr_type abs(const storage_type& a) { bool isneg = (a<0); return e_(isneg ? -a : a); } - static inline __host__ __device__ expr_type neg(const storage_type& a) { return e_(-m_(a)); } - static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); } + + static THC_DECL const T min() { return THCNumConstants::min(); } + static THC_DECL const T max() { return THCNumConstants::max(); } + + static THC_DECL bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } + static THC_DECL bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } + static THC_DECL bool gt(const storage_type& a, const storage_type& b) { return m_(a) > m_(b); } + static THC_DECL bool ge(const storage_type& a, const storage_type& b) { return m_(a) >= m_(b); } + static THC_DECL bool eq(const storage_type& a, const storage_type& b) { return m_(a) == m_(b); } + static THC_DECL bool ne(const storage_type& a, const storage_type& b) { return m_(a) != m_(b); } + + static THC_DECL expr_type add(const storage_type& a, const storage_type& b) { return e_(m_(a) + m_(b)); } + static THC_DECL expr_type mul(const storage_type& a, const storage_type& b) { return e_(m_(a) * m_(b)); } + static THC_DECL expr_type sub(const storage_type& a, const storage_type& b) { return e_(m_(a) - m_(b)); } + static THC_DECL expr_type div(const storage_type& a, const storage_type& b) { return e_(m_(a) / m_(b)); } + static THC_DECL expr_type abs(const storage_type& a) { bool isneg = (a<0); return e_(isneg ? -a : a); } + static THC_DECL expr_type neg(const storage_type& a) { return e_(-m_(a)); } + static THC_DECL expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); } + static THC_DECL expr_type mod(const storage_type& a, const storage_type& b) { return e_(m_(a) % m_(b)); } }; @@ -70,7 +75,7 @@ struct THCNumBase : public THCNumCommonBase { template <> struct THCNumBase : public THCNumCommonBase { - static inline __host__ __device__ expr_type abs(const storage_type& a) { return labs(a); } + static THC_DECL expr_type abs(const storage_type& a) { return labs(a); } }; template @@ -83,30 +88,32 @@ struct THCNumBase : public THCNumCommonBase { using typename Base::expr_type; using typename Base::storage_type; - static inline __host__ __device__ expr_type exp (const storage_type& a) { return e_(::exp(m_(a))); } - static inline __host__ __device__ expr_type log (const storage_type& a) { return e_(::log(m_(a))); } - static inline __host__ __device__ expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } - static inline __host__ __device__ expr_type cos (const storage_type& a) { return e_(::cos(m_(a))); } - static inline __host__ __device__ expr_type sin (const storage_type& a) { return e_(::sin(m_(a))); } - static inline __host__ __device__ expr_type sqrt (const storage_type& a) { return e_(::sqrt(m_(a))); } - static inline __host__ __device__ expr_type rsqrt(const storage_type& a) { return e_(::rsqrt(m_(a))); } - static inline __host__ __device__ expr_type ceil (const storage_type& a) { return e_(::ceil(m_(a))); } - static inline __host__ __device__ expr_type floor(const storage_type& a) { return e_(::floor(m_(a))); } - static inline __host__ __device__ expr_type trunc(const storage_type& a) { return e_(::trunc(m_(a))); } - static inline __host__ __device__ expr_type acos (const storage_type& a) { return e_(::acos(m_(a))); } - static inline __host__ __device__ expr_type cosh (const storage_type& a) { return e_(::cosh(m_(a))); } - static inline __host__ __device__ expr_type acosh(const storage_type& a) { return e_(::acosh(m_(a))); } - static inline __host__ __device__ expr_type asin (const storage_type& a) { return e_(::asin(m_(a))); } - static inline __host__ __device__ expr_type sinh (const storage_type& a) { return e_(::sinh(m_(a))); } - static inline __host__ __device__ expr_type asinh(const storage_type& a) { return e_(::asinh(m_(a))); } - static inline __host__ __device__ expr_type tan (const storage_type& a) { return e_(::tan(m_(a))); } - static inline __host__ __device__ expr_type atan (const storage_type& a) { return e_(::atan(m_(a))); } - static inline __host__ __device__ expr_type tanh (const storage_type& a) { return e_(::tanh(m_(a))); } - static inline __host__ __device__ expr_type abs (const storage_type& a) { return e_(::abs(m_(a))); } - static inline __host__ __device__ expr_type round(const storage_type& a) { return e_(::round(m_(a))); } - static inline __host__ __device__ expr_type frac (const storage_type& a) { return e_(m_(a) - ::trunc(m_(a))); } - static inline __host__ __device__ expr_type cinv (const storage_type& a) { return Base::div(THCNumConstants::one(), a); } - static inline __host__ __device__ expr_type pow (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); } + static THC_DECL expr_type exp (const storage_type& a) { return e_(::exp(m_(a))); } + static THC_DECL expr_type log (const storage_type& a) { return e_(::log(m_(a))); } + static THC_DECL expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } + static THC_DECL expr_type cos (const storage_type& a) { return e_(::cos(m_(a))); } + static THC_DECL expr_type sin (const storage_type& a) { return e_(::sin(m_(a))); } + static THC_DECL expr_type sqrt (const storage_type& a) { return e_(::sqrt(m_(a))); } + static THC_DECL expr_type rsqrt(const storage_type& a) { return e_(::rsqrt(m_(a))); } + static THC_DECL expr_type ceil (const storage_type& a) { return e_(::ceil(m_(a))); } + static THC_DECL expr_type floor(const storage_type& a) { return e_(::floor(m_(a))); } + static THC_DECL expr_type trunc(const storage_type& a) { return e_(::trunc(m_(a))); } + static THC_DECL expr_type acos (const storage_type& a) { return e_(::acos(m_(a))); } + static THC_DECL expr_type cosh (const storage_type& a) { return e_(::cosh(m_(a))); } + static THC_DECL expr_type acosh(const storage_type& a) { return e_(::acosh(m_(a))); } + static THC_DECL expr_type asin (const storage_type& a) { return e_(::asin(m_(a))); } + static THC_DECL expr_type sinh (const storage_type& a) { return e_(::sinh(m_(a))); } + static THC_DECL expr_type asinh(const storage_type& a) { return e_(::asinh(m_(a))); } + static THC_DECL expr_type tan (const storage_type& a) { return e_(::tan(m_(a))); } + static THC_DECL expr_type atan (const storage_type& a) { return e_(::atan(m_(a))); } + static THC_DECL expr_type tanh (const storage_type& a) { return e_(::tanh(m_(a))); } + static THC_DECL expr_type abs (const storage_type& a) { return e_(::abs(m_(a))); } + static THC_DECL expr_type round(const storage_type& a) { return e_(::round(m_(a))); } + static THC_DECL expr_type frac (const storage_type& a) { return e_(m_(a) - ::trunc(m_(a))); } + static THC_DECL expr_type cinv (const storage_type& a) { return Base::div(THCNumConstants::one(), a); } + static THC_DECL expr_type pow (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); } + static THC_DECL expr_type mod (const storage_type& a, const storage_type& b) { return e_(::fmod(m_(a), m_(b))); } + }; template @@ -120,6 +127,7 @@ struct THCNumerics: public THCNumBase::is_integer> #ifdef CUDA_HALF_TENSOR +#ifndef CUDA_HALF_INSTRUCTIONS template <> struct THCNumerics: public THCNumBase { typedef THCNumCommonBase Base; @@ -132,85 +140,99 @@ struct THCNumerics: public THCNumBase { typedef THCNumConstants Constants; }; - -#if TO_BE_CONTINUED_WITH_EXTRA_TEMPLATE_PARAM +#else template <> -struct THCNumerics: public THCNumBase { +struct THCNumerics: public THCNumBase { typedef THCNumCommonBase Base; - using typename Base::math_type; - using typename Base::expr_type; - using typename Base::storage_type; - using Base::e_; - using Base::m_; - using Base::s_; typedef THCNumConstants Constants; - static inline __host__ __device__ bool lt(const half& a, const half& b) { + typedef typename Base::storage_type storage_type; + typedef typename Base::math_type math_type; + typedef typename Base::expr_type expr_type; + static THC_DECL math_type m_(const storage_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL expr_type e_(const math_type& a) { + return ScalarConvert::to(a); + } + static THC_DECL storage_type s_(const expr_type& a) { + return ScalarConvert::to(a); + } + + static THC_DECL bool lt(const half& a, const half& b) { return __hlt(a, b); } - static inline __host__ __device__ bool le(const half& a, const half& b) { + static THC_DECL bool le(const half& a, const half& b) { return __hle(a, b); } - static inline __host__ __device__ bool gt(const half& a, const half& b) { + static THC_DECL bool gt(const half& a, const half& b) { return __hgt(a, b); } - static inline __host__ __device__ bool ge(const half& a, const half& b) { + static THC_DECL bool ge(const half& a, const half& b) { return __hge(a, b); } - static inline __host__ __device__ bool eq(const half& a, const half& b) { + static THC_DECL bool eq(const half& a, const half& b) { return __heq(a, b); } - static inline __host__ __device__ bool ne(const half& a, const half& b) { + static THC_DECL bool ne(const half& a, const half& b) { return __hne(a, b); } - static inline __host__ __device__ half exp(const half& a) { + static THC_DECL half exp(const half& a) { return hexp(a); } - static inline __host__ __device__ half log(const half& a) { + static THC_DECL half log(const half& a) { return hlog(a); } - static inline __host__ __device__ half cos(const half& a) { + static THC_DECL half cos(const half& a) { return hcos(a); } - static inline __host__ __device__ half sin(const half& a) { + static THC_DECL half sin(const half& a) { return hsin(a); } - static inline __host__ __device__ half sqrt(const half& a) { + static THC_DECL half sqrt(const half& a) { return hsqrt(a); } - static inline __host__ __device__ half rsqrt(const half& a) { + static THC_DECL half rsqrt(const half& a) { return hrsqrt(a); } - static inline __host__ __device__ half ceil(const half& a) { + static THC_DECL half ceil(const half& a) { return hceil(a); } - static inline __host__ __device__ half floor(const half& a) { + static THC_DECL half floor(const half& a) { return hfloor(a); } - static inline __host__ __device__ half trunc(const half& a) { + static THC_DECL half trunc(const half& a) { return htrunc(a); } - static inline __host__ __device__ half neg(const half& a) { + static THC_DECL half neg(const half& a) { return __hneg(a); } - static inline __host__ __device__ const half& add(const half& a, const half& b) { + static THC_DECL const half& add(const half& a, const half& b) { return __hadd(a, b); } - static inline __host__ __device__ half mul(const half& a, const half& b) { + static THC_DECL half mul(const half& a, const half& b) { return __hmul(a, b); } - static inline __host__ __device__ half sub(const half& a, const half& b) { + static THC_DECL half sub(const half& a, const half& b) { return __hsub(a, b); } + + static THC_DECL half div (const half& a, const half& b) { + return hdiv(a,b); + } + static THC_DECL half mod (const half& a, const half& b) { + return __float2half(fmodf(__half2float(a), __half2float(b))); + } + }; # endif #endif diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c index 6fc9574e..669efa82 100644 --- a/lib/THC/THCStorage.c +++ b/lib/THC/THCStorage.c @@ -2,5 +2,7 @@ #include "THCGeneral.h" #include "THAtomic.h" +#include "THCHalf.h" + #include "generic/THCStorage.c" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu index eba9ef66..5555c6f1 100644 --- a/lib/THC/THCStorage.cu +++ b/lib/THC/THCStorage.cu @@ -7,5 +7,7 @@ #include #endif +#include "THCHalf.h" + #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h index 6a27fa34..ac1cd709 100644 --- a/lib/THC/THCStorage.h +++ b/lib/THC/THCStorage.h @@ -1,9 +1,8 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC -#include "THCGeneral.h" - #include "THStorage.h" +#include "THCGeneral.h" #define THCStorage TH_CONCAT_3(TH,CReal,Storage) #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c index 6dcb06c0..ee9bf815 100644 --- a/lib/THC/THCStorageCopy.c +++ b/lib/THC/THCStorageCopy.c @@ -1,4 +1,5 @@ #include "THCStorageCopy.h" + #include "THCTensorCopy.h" #include "generic/THCStorageCopy.c" diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu index 68113dbc..56641888 100644 --- a/lib/THC/THCStorageCopy.cu +++ b/lib/THC/THCStorageCopy.cu @@ -1,5 +1,5 @@ -#include "THCGeneral.h" #include "THCStorageCopy.h" +#include "THCGeneral.h" #include "THCHalf.h" #include "THCTensorCopy.h" diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h index ec8011d3..837056fc 100644 --- a/lib/THC/THCStorageCopy.h +++ b/lib/THC/THCStorageCopy.h @@ -3,6 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h index 49fbf947..e2521b11 100644 --- a/lib/THC/THCTensor.h +++ b/lib/THC/THCTensor.h @@ -1,10 +1,9 @@ #ifndef THC_TENSOR_INC #define THC_TENSOR_INC -#include "THCGeneral.h" #include "THTensor.h" #include "THCStorage.h" - +#include "THCGeneral.h" #define THCTensor TH_CONCAT_3(TH,CReal,Tensor) #define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME) diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index 2ae3cfe9..e6b35677 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -1,4 +1,5 @@ #include "THCApply.cuh" +#include "THCHalf.h" #include "THCNumerics.cuh" inline int curGPU() { diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h index fc206cb7..e8bc4f4b 100644 --- a/lib/THC/THCTensorCopy.h +++ b/lib/THC/THCTensorCopy.h @@ -3,6 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCTensorCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index 1fe3683f..415e6256 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -1,10 +1,10 @@ -#include "THCGeneral.h" - +#include "THC.h" #include "THCTensorMath.h" +#include "THCGeneral.h" #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" - +#include "THCHalf.h" #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index e074a3a7..19ae679c 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -2,6 +2,7 @@ #define TH_CUDA_TENSOR_MATH_INC #include "THCTensor.h" +#include "THCGeneral.h" #include "generic/THCTensorMath.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 90b6fb3b..1511fb88 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -111,7 +111,7 @@ struct TensorRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in, __hmul(val, hfloor(__hdiv(*in, val)))); + *out = __hsub(*in, __hmul(val, hfloor(hdiv(*in, val)))); #else float fin = __half2float(*in); float fout = fin - fval * floorf(fin / fval); @@ -121,7 +121,7 @@ struct TensorRemainderOp { __device__ __forceinline__ void operator()(half* v) { #ifdef CUDA_HALF_INSTRUCTIONS - *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val)))); + *v = __hsub(*v, __hmul(val, hfloor(hdiv(*v, val)))); #else float fv = __half2float(*v); fv = fv - fval * floorf(fv / fval); diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh index 87ab8b41..f9499c5e 100644 --- a/lib/THC/THCTensorMathPointwise.cuh +++ b/lib/THC/THCTensorMathPointwise.cuh @@ -175,7 +175,7 @@ template <> struct TensorCRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in)))); + *out = __hsub(*out, __hmul(*in, hfloor(hdiv(*out, *in)))); #else float fout = __half2float(*out); float fin = __half2float(*in); @@ -185,7 +185,7 @@ struct TensorCRemainderOp { __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { #ifdef CUDA_HALF_INSTRUCTIONS - *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2)))); + *out = __hsub(*in1, __hmul(*in2, hfloor(hdiv(*in1, *in2)))); #else float fin1 = __half2float(*in1); float fin2 = __half2float(*in2); @@ -197,49 +197,14 @@ struct TensorCRemainderOp { template struct TensorCFmodOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *out % *in; - } - + typedef THCNumerics N_; __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 % *in2; - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = fmodf(*in1, *in2); + *out = N_::s_(N_::mod(*in1,*in2)); } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*out, *in); - } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = fmod(*in1, *in2); - } -}; - -#ifdef CUDA_HALF_TENSOR -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(T* out, T* in) { + this->operator()(out, out, in); } }; -#endif // CUDA_HALF_TENSOR template struct TensorClampOp { diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu index 43a2f4bb..bdcbcbec 100644 --- a/lib/THC/THCTensorTypeUtils.cu +++ b/lib/THC/THCTensorTypeUtils.cu @@ -1,6 +1,7 @@ #include "THCTensorTypeUtils.cuh" #include "THCTensor.h" #include "THCTensorCopy.h" +#include "THCHalf.h" #include namespace { diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 7c32eeca..273606e5 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -4,6 +4,7 @@ #include #include #include "THCGeneral.h" +#include "THCHalf.h" #include "THCTensor.h" #include "THCTensorInfo.cuh" @@ -74,7 +75,7 @@ TENSOR_UTILS(THCudaCharTensor, char, long); TENSOR_UTILS(THCudaShortTensor, short, long); TENSOR_UTILS(THCudaIntTensor, int, long); TENSOR_UTILS(THCudaLongTensor, long, long); -TENSOR_UTILS(THCudaTensor, float, double); +TENSOR_UTILS(THCudaTensor, float, float); TENSOR_UTILS(THCudaDoubleTensor, double, double); #ifdef CUDA_HALF_TENSOR @@ -125,4 +126,55 @@ getTensorInfo(THCState* state, TensorType* t) { TensorUtils::getData(state, t), dims, sz, st); } +template +struct ScalarNegate { + static __host__ __device__ T to(const T v) { return -v; } +}; + +template +struct ScalarInv { + static __host__ __device__ T to(const T v) { return ((T) 1) / v; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct ScalarNegate { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif +#else + half out = v; + out.x ^= 0x8000; // toggle sign bit + return out; +#endif + } +}; + +template <> +struct ScalarInv { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ + return __float2half(1.0f / __half2float(v)); +#else + float fv = THC_half2float(v); + fv = 1.0f / fv; + return THC_float2half(fv); +#endif + } +}; + +inline bool operator==(half a, half b) { + return a.x == b.x; +} + +inline bool operator!=(half a, half b) { + return a.x != b.x; +} + +#endif // CUDA_HALF_TENSOR + #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c index 517df27c..13069069 100644 --- a/lib/THC/generic/THCStorageCopy.c +++ b/lib/THC/generic/THCStorageCopy.c @@ -25,6 +25,7 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Short) TH_CUDA_STORAGE_IMPLEMENT_COPY(Int) TH_CUDA_STORAGE_IMPLEMENT_COPY(Long) TH_CUDA_STORAGE_IMPLEMENT_COPY(Float) +TH_CUDA_STORAGE_IMPLEMENT_COPY(Half) TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) @@ -44,7 +45,6 @@ void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Sto THCTensor_(free)(state, srcTensor); \ TH##TYPEC##Tensor_free(selfTensor); \ } - TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char) TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short) @@ -56,7 +56,5 @@ TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double) #undef TH_CUDA_STORAGE_IMPLEMENT_COPY #undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO -#undef TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX -#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX #endif diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index f5311094..7a4ef6bb 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -22,7 +22,6 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); - #ifdef CUDA_HALF_TENSOR THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h index 270d16e8..e549f09b 100644 --- a/lib/THC/generic/THCTensorCopy.h +++ b/lib/THC/generic/THCTensorCopy.h @@ -32,7 +32,6 @@ THC_API void TH_CONCAT_2(THLongTensor_copyCuda , Real) (THCState *state, THLon THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real) (THCState *state, THFloatTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real) (THCState *state, THDoubleTensor *self, THCTensor *src); THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real) (THCState *state, THHalfTensor *self, THCTensor *src); - THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src); THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src); diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu index 63c9989f..545971f9 100644 --- a/lib/THC/generic/THCTensorMathBlas.cu +++ b/lib/THC/generic/THCTensorMathBlas.cu @@ -36,7 +36,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #else THError("unimplemented data type"); - return ScalarConvert::to(0); + return 0; #endif } diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu index 0b4094b8..b7e82021 100644 --- a/lib/THC/generic/THCTensorMathPairwise.cu +++ b/lib/THC/generic/THCTensorMathPairwise.cu @@ -63,7 +63,7 @@ THC_API void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) { THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); - THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); + THArgCheck(value != THCNumConstants::zero(), 3, "divide by zero"); if (self_ == src_) { if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp(value))) { diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu index 755b1a34..b97908a2 100644 --- a/lib/THC/generic/THCTensorMathPointwise.cu +++ b/lib/THC/generic/THCTensorMathPointwise.cu @@ -263,7 +263,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, // self += -value * src2 if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp( - THCNumerics::neg(value)))) { + ScalarNegate::to(value)))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -279,7 +279,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, // self = src1 - value * src2 if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp( - THCNumerics::neg(value)))) { + ScalarNegate::to(value)))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu index aad7bceb..ed0e2049 100644 --- a/lib/THC/generic/THCTensorMathReduce.cu +++ b/lib/THC/generic/THCTensorMathReduce.cu @@ -301,7 +301,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) { thrust::identity(), ReduceMin(), ReduceMin(), - THCNumConstants::storage_type>::max(), &val, 0)) { + THCNumerics::max(), &val, 0)) { THArgCheck(false, 1, CUTORCH_DIM_WARNING); } @@ -317,7 +317,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) { thrust::identity(), ReduceMax(), ReduceMax(), - THCNumConstants::storage_type>::min(), &val, 0)) { + THCNumerics::min(), &val, 0)) { THArgCheck(false, 1, CUTORCH_DIM_WARNING); } @@ -336,7 +336,7 @@ THCTensor_(max)(THCState *state, thrust::pair::DataType, long> init = thrust::make_pair::DataType, long>( - THCNumConstants::DataType>::storage_type>::min(), 1); + THCNumerics::DataType>::min(), 1); return THC_reduceDimIndex( state, values, indices, src, dimension, init, @@ -354,7 +354,7 @@ THCTensor_(min)(THCState *state, thrust::pair::DataType, long> init = thrust::make_pair::DataType, long>( - THCNumConstants::DataType>::storage_type>::max(), 1); + THCNumerics::DataType>::max(), 1); return THC_reduceDimIndex( state, values, indices, src, dimension, init, From 18f5ce19c42a7981782a4c6168d4e435ffa9b20e Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 15 Jan 2017 15:31:38 -0800 Subject: [PATCH 12/13] cleanup --- FFI.lua | 14 ++--- Tensor.lua | 1 + lib/THC/THCHalf.c | 91 ---------------------------- lib/THC/THCNumerics.cuh | 33 +++++----- lib/THC/generic/THCTensorMathBlas.cu | 2 +- 5 files changed, 20 insertions(+), 121 deletions(-) delete mode 100644 lib/THC/THCHalf.c diff --git a/FFI.lua b/FFI.lua index fb67c89b..b2777a2b 100644 --- a/FFI.lua +++ b/FFI.lua @@ -47,16 +47,10 @@ cudaStream_t THCState_getCurrentStream(THCState *state); {'long','Long'}, {'double','Double'}, } - if cutorch.hasHalf then - if cutorch.hasFastHalfInstructions() then - -- Enable native half math on Pascal plaforms where fp16 is efficient (GP100) - table.insert(CudaTypes, {'half','Half'}) - else - -- on the rest (Maxwell and Pascal 6.1 (1080), resort to storage-only - -- (a.k.a 'pseudo') fp16 type (16-bit storage, float math via conversions) - table.insert(CudaTypes, {'half','Half', 'float'}) - end - end + if cutorch.hasHalf then + table.insert(CudaTypes, {'half','Half'}) + end + for _, typedata in ipairs(CudaTypes) do local real, Real = unpack(typedata) local ctype_def = [[ diff --git a/Tensor.lua b/Tensor.lua index a2724d54..0029291b 100644 --- a/Tensor.lua +++ b/Tensor.lua @@ -88,3 +88,4 @@ for ValueType, CudaTensorType in pairs(CudaTensorTypes) do end rawset(torch.getmetatable(CudaTensorType), 'totable', Tensor__totable) end + diff --git a/lib/THC/THCHalf.c b/lib/THC/THCHalf.c deleted file mode 100644 index 0f567674..00000000 --- a/lib/THC/THCHalf.c +++ /dev/null @@ -1,91 +0,0 @@ -#include "THCHalf.h" - -float THC_half2float(half a) -{ - unsigned int bits = a.x & 0x7fff; - unsigned int sign = a.x & 0x8000; - unsigned int exp = a.x & 0x7c00; - - bits <<= 13; - sign <<= 16; - - bits += 0x38000000U; - - // flush denormals to 0 - bits = (exp == 0 ? 0 : bits) | sign; - - union { - float f; - unsigned int v; - } conv; - conv.v = bits; - - return conv.f; -} - -/* - Copyright (c) 2015, Norbert Juffa - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -half THC_float2half(float a) -{ - uint32_t ia; - uint16_t ir; - memcpy(&ia, &a, sizeof(float)); - - ir = (ia >> 16) & 0x8000; - if ((ia & 0x7f800000) == 0x7f800000) { - if ((ia & 0x7fffffff) == 0x7f800000) { - ir |= 0x7c00; /* infinity */ - } else { - ir = 0x7fff; /* canonical NaN */ - } - } else if ((ia & 0x7f800000) >= 0x33000000) { - int shift = (int)((ia >> 23) & 0xff) - 127; - if (shift > 15) { - ir |= 0x7c00; /* infinity */ - } else { - ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */ - if (shift < -14) { /* denormal */ - ir |= ia >> (-1 - shift); - ia = ia << (32 - (-1 - shift)); - } else { /* normal */ - ir |= ia >> (24 - 11); - ia = ia << (32 - (24 - 11)); - ir = ir + ((14 + shift) << 10); - } - /* IEEE-754 round to nearest of even */ - if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) { - ir++; - } - } - } - - half ret; - memcpy(&ret, &ir, sizeof(half)); - return ret; -} diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh index a5d558c1..6b504e77 100644 --- a/lib/THC/THCNumerics.cuh +++ b/lib/THC/THCNumerics.cuh @@ -10,8 +10,6 @@ struct THCNumConstants { static THC_DECL const T one() { return T(1); } static THC_DECL const T zero() { return T(0); } - static THC_DECL const T min() { return std::numeric_limits::min(); } - static THC_DECL const T max() { return std::numeric_limits::max(); } }; @@ -20,8 +18,6 @@ struct THCNumConstants { static THC_DECL const half one() { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */ static THC_DECL const half zero() { half ret; ret.x = 0; return ret;} - static THC_DECL const half min() { half ret; ret.x = 0xFBFF; return ret; } - static THC_DECL const half max() { half ret; ret.x = 0x7BFF; return ret; } }; template @@ -45,9 +41,6 @@ struct THCNumCommonBase { return ScalarConvert::to(a); } - static THC_DECL const T min() { return THCNumConstants::min(); } - static THC_DECL const T max() { return THCNumConstants::max(); } - static THC_DECL bool lt(const storage_type& a, const storage_type& b) { return m_(a) < m_(b); } static THC_DECL bool le(const storage_type& a, const storage_type& b) { return m_(a) <= m_(b); } static THC_DECL bool gt(const storage_type& a, const storage_type& b) { return m_(a) > m_(b); } @@ -63,7 +56,8 @@ struct THCNumCommonBase { static THC_DECL expr_type neg(const storage_type& a) { return e_(-m_(a)); } static THC_DECL expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); } static THC_DECL expr_type mod(const storage_type& a, const storage_type& b) { return e_(m_(a) % m_(b)); } - + static THC_DECL const T min() { return std::numeric_limits::min(); } + static THC_DECL const T max() { return std::numeric_limits::max(); } }; template @@ -88,6 +82,10 @@ struct THCNumBase : public THCNumCommonBase { using typename Base::expr_type; using typename Base::storage_type; + static THC_DECL const T min() { return -std::numeric_limits::max(); } + static THC_DECL const T max() { return std::numeric_limits::max(); } + + static THC_DECL expr_type exp (const storage_type& a) { return e_(::exp(m_(a))); } static THC_DECL expr_type log (const storage_type& a) { return e_(::log(m_(a))); } static THC_DECL expr_type log1p(const storage_type& a) { return e_(::log1p(m_(a))); } @@ -127,9 +125,13 @@ struct THCNumerics: public THCNumBase::is_integer> #ifdef CUDA_HALF_TENSOR -#ifndef CUDA_HALF_INSTRUCTIONS template <> struct THCNumerics: public THCNumBase { + static THC_DECL const half min() { half ret; ret.x = 0xFBFF; return ret; } + static THC_DECL const half max() { half ret; ret.x = 0x7BFF; return ret; } + typedef THCNumConstants Constants; + +# ifndef CUDA_HALF_INSTRUCTIONS typedef THCNumCommonBase Base; using typename Base::math_type; using typename Base::expr_type; @@ -137,15 +139,8 @@ struct THCNumerics: public THCNumBase { using Base::e_; using Base::m_; using Base::s_; - typedef THCNumConstants Constants; -}; - -#else -template <> -struct THCNumerics: public THCNumBase { +# else typedef THCNumCommonBase Base; - typedef THCNumConstants Constants; - typedef typename Base::storage_type storage_type; typedef typename Base::math_type math_type; typedef typename Base::expr_type expr_type; @@ -232,9 +227,9 @@ struct THCNumerics: public THCNumBase { static THC_DECL half mod (const half& a, const half& b) { return __float2half(fmodf(__half2float(a), __half2float(b))); } - -}; # endif +}; + #endif #endif // THC_NUMERICS_INC diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu index 545971f9..63c9989f 100644 --- a/lib/THC/generic/THCTensorMathBlas.cu +++ b/lib/THC/generic/THCTensorMathBlas.cu @@ -36,7 +36,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #else THError("unimplemented data type"); - return 0; + return ScalarConvert::to(0); #endif } From b6771be0b8f72f1ff4fba98dd9d88700cb0eeb0d Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 15 Jan 2017 16:03:40 -0800 Subject: [PATCH 13/13] Enabling half instructions for selected Pascal archs --- lib/THC/THCHalf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index 5e35694e..dbeb4311 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -41,7 +41,7 @@ THC_API int THC_fastHalfInstructions(THCState *state); # define THC_DECL inline # endif -#if 0 // __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620 +#if __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620 # define CUDA_HALF_INSTRUCTIONS 1 #endif