From b928ca019ea061cb9ebaca2dd81450aa24f25268 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Fri, 30 Sep 2016 02:03:56 -0700
Subject: [PATCH 01/13] temporarily force pseudo-fp16 mode
 (HAS_HALF_INSTRUCTIONS=FALSE) for Pascal:
 https://github.com/torch/cutorch/issues/520

---
 init.c             |  4 ++++
 lib/THC/THCHalf.cu |  4 ++++
 lib/THC/THCHalf.h  | 16 ++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/init.c b/init.c
index 82f9a823..c92982cc 100644
--- a/init.c
+++ b/init.c
@@ -1032,6 +1032,10 @@ int luaopen_libcutorch(lua_State *L)
 #endif
   lua_setfield(L, -2, "hasHalf");
 
+  /* true fp16 vs pseudo-fp16 mode: this one is per device */
+  lua_pushboolean(L, THC_nativeHalfInstructions(state));
+  lua_setfield(L, -2, "hasHalfInstructions");
+
   /* store gpu driver version in field */
   int driverVersion;
   THCudaCheck(cudaDriverGetVersion(&driverVersion));
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 7777bf76..f2484d4a 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -121,10 +121,14 @@ half THC_float2half(float a)
 }
 
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
+#if CUDA_HALF_INSTRUCTIONS
   cudaDeviceProp* prop =
     THCState_getCurrentDeviceProperties(state);
 
   // CC 5.3+
   return (prop->major > 5 ||
           (prop->major == 5 && prop->minor == 3));
+#else
+        return false;
+#endif
 }
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index afa83e47..b360d948 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -8,9 +8,21 @@
 #define CUDA_HALF_TENSOR 1
 #endif
 
+/* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode")
+   even if native harware support is available.
+   This makes difference for Pascal (6.x) cards only: Maxwell (5.x) cards always run 'half' in pseudo mode.
+   !!! Uncomment on your own risk !!!
+   Native fp16 operations may in fact run slower than pseudo-fp16 on your system at the moment
+  (especially if the bulk of your code is in CUDNN and not Cutorch).
+*/
+
+#define FORCE_PSEUDO_FP16 1
+
+#ifndef FORCE_PSEUDO_FP16
 /* Kernel side: Native fp16 ALU instructions are available if we have this: */
-#if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530)
-#define CUDA_HALF_INSTRUCTIONS 1
+# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530)
+#  define CUDA_HALF_INSTRUCTIONS 1
+# endif
 #endif
 
 #ifdef CUDA_HALF_TENSOR

From a80d28bbc90b0aae3bd8335d3dae2a8956dcecb6 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Wed, 12 Oct 2016 17:41:43 -0700
Subject: [PATCH 02/13] checkpoint

---
 lib/THC/THCHalf.h       | 63 +++++++++++++++++++++++++++++++++---
 lib/THC/THCNumerics.cuh | 72 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 5 deletions(-)

diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 76271bcc..616ec62e 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -26,18 +26,71 @@
 #endif
 
 #ifdef CUDA_HALF_TENSOR
-
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
-THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
-THC_API half THC_float2half(float a);
-THC_API float THC_half2float(half a);
+/* CPU emulation */
+THC_EXTERNC half THC_float2half(float a);
+THC_EXTERNC float THC_half2float(half a);
+
+#if defined (__CUDA_ARCH__)
+# define THC_FLOAT_TO_HALF(x) __float2half((float)x)
+# define THC_HALF_TO_FLOAT(x) __half2float((float)x)
+#else
+# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
+# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x)
+#endif
 
 /* Check for native fp16 support on the current device (CC 5.3+) */
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state);
 
+__host__ __device__ __forceinline__ bool operator==(const half& a, const half& b) {
+  return a.x == b.x;
+}
+
+__host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b) {
+  return a.x != b.x;
+}
+
 #endif /* CUDA_HALF_TENSOR */
 
+#ifdef __CUDA_ARCH__
+//
+// host (CPU) routines
+//
+THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
+THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
+
+/// `half` has some type conversion issues associated with it, since it
+/// is a struct without a constructor/implicit conversion constructor.
+/// We use this to convert scalar values to the given type that the
+/// tensor expects.
+
+template <typename In, typename Out>
+struct ScalarConvert {
+  static inline __host__ __device__ Out to(const In& v) { return Out(v); }
+};
+
+template <typename Out>
+struct ScalarConvert<half, Out> {
+  static __host__ __device__ __forceinline__ Out to(const half& v) {
+    return (Out) THC_HALF_TO_FLOAT(v);
+  }
+};
+
+template <typename In>
+struct ScalarConvert<In, half> {
+  static __host__ __device__ __forceinline__ half to(const In& v) {
+    return THC_FLOAT_TO_HALF(v);
+  }
+};
+
+template <>
+struct ScalarConvert<half, half> {
+  static __host__ __device__ __forceinline__ half to(const half& v) {
+    return v;
+  }
+};
+#endif /* CUDA */
+
 #endif
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index a9d78978..09443605 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -26,6 +26,9 @@ struct THCNumerics<unsigned char> {
   static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; }
 
   static inline __host__ __device__  unsigned char add(unsigned char a, unsigned char b) { return a + b; }
+  static inline __host__ __device__  unsigned char mul(unsigned char a, unsigned char b) { return a * b; }
+  static inline __host__ __device__  unsigned char sub(unsigned char a, unsigned char b) { return a - b; }
+  static inline __host__ __device__  unsigned char div(unsigned char a, unsigned char b) { return a / b; }
   static inline __host__ __device__  unsigned char abs(unsigned char a) { return abs(a); }
 };
 
@@ -42,6 +45,9 @@ struct THCNumerics<char> {
   static inline __host__ __device__ bool ne(char a, char b) { return a != b; }
 
   static inline __host__ __device__  char add(char a, char b) { return a + b; }
+  static inline __host__ __device__  char mul(char a, char b) { return a * b; }
+  static inline __host__ __device__  char sub(char a, char b) { return a - b; }
+  static inline __host__ __device__  char div(char a, char b) { return a / b; }
   static inline __host__ __device__  char abs(char a) { return abs(a); }
 };
 
@@ -58,6 +64,9 @@ struct THCNumerics<short> {
   static inline __host__ __device__ bool ne(short a, short b) { return a != b; }
 
   static inline __host__ __device__  short add(short a, short b) { return a + b; }
+  static inline __host__ __device__  short mul(short a, short b) { return a * b; }
+  static inline __host__ __device__  short sub(short a, short b) { return a - b; }
+  static inline __host__ __device__  short div(short a, short b) { return a / b; }
   static inline __host__ __device__  short abs(short a) { return abs(a); }
 };
 
@@ -74,6 +83,9 @@ struct THCNumerics<int> {
   static inline __host__ __device__ bool ne(int a, int b) { return a != b; }
 
   static inline __host__ __device__  int add(int a, int b) { return a + b; }
+  static inline __host__ __device__  int mul(int a, int b) { return a * b; }
+  static inline __host__ __device__  int sub(int a, int b) { return a - b; }
+  static inline __host__ __device__  int div(int a, int b) { return a / b; }
   static inline __host__ __device__  int abs(int a) { return ::abs(a); }
 };
 
@@ -90,6 +102,9 @@ struct THCNumerics<long> {
   static inline __host__ __device__ bool ne(long a, long b) { return a != b; }
 
   static inline __host__ __device__  long add(long a, long b) { return a + b; }
+  static inline __host__ __device__  long mul(long a, long b) { return a * b; }
+  static inline __host__ __device__  long sub(long a, long b) { return a - b; }
+  static inline __host__ __device__  long div(long a, long b) { return a / b; };
   static inline __host__ __device__  long abs(long a) { return labs(a); }
 };
 
@@ -435,6 +450,55 @@ struct THCNumerics<half> {
     return THC_float2half(THC_half2float(a) + THC_half2float(b));
 #endif
   }
+
+  static inline __host__ __device__ half div(half a, half b) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa / fb );
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) / THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half mul(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hmul(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa * fb );
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) * THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half sub(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hsub(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half( fa - fb );
+#endif
+#else // __CUDA_ARCH__
+    return THC_float2half(THC_half2float(a) - THC_half2float(b));
+#endif
+  }
+
+  static inline __host__ __device__ half pow(half a, half b) {
+#ifdef __CUDA_ARCH__
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(powf(fa, fb));
+#else // __CUDA_ARCH__
+    return THC_float2half(powf(THC_half2float(a), THC_half2float(b)));
+#endif
+  }
+
 };
 #endif
 
@@ -475,6 +539,10 @@ struct THCNumerics<float> {
   static inline __host__ __device__  float frac (float a) { return a - truncf(a); }
   static inline __host__ __device__  float cinv (float a) { return 1.0f / a; }
   static inline __host__ __device__  float add  (float a, float b) { return a + b; }
+  static inline __host__ __device__  float div  (float a, float b) { return a / b; }
+  static inline __host__ __device__  float mul  (float a, float b) { return a * b; }
+  static inline __host__ __device__  float sub  (float a, float b) { return a - b; }
+  static inline __host__ __device__  float pow  (float a, float b) { return powf(a, b); }
 };
 
 template <>
@@ -514,6 +582,10 @@ struct THCNumerics<double> {
   static inline __host__ __device__  double frac (double a) { return a - ::trunc(a); }
   static inline __host__ __device__  double cinv (double a) { return 1.0 / a; }
   static inline __host__ __device__  double add  (double a, double b) { return a + b; }
+  static inline __host__ __device__  double div  (double a, double b) { return a / b; }
+  static inline __host__ __device__  double mul  (double a, double b) { return a * b; }
+  static inline __host__ __device__  double sub  (double a, double b) { return a - b; }
+  static inline __host__ __device__  double pow  (double a, double b) { return ::pow(a, b); }
 };
 
 /// `half` has some type conversion issues associated with it, since it

From ac964ee15f1cf434bdcb674958abaecc5018e7ba Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Fri, 14 Oct 2016 01:03:47 -0700
Subject: [PATCH 03/13] checkpoint

---
 lib/THC/THCHalf.cu                 | 103 -----
 lib/THC/THCHalf.h                  |  87 ++--
 lib/THC/THCNumerics.cuh            | 690 ++++++-----------------------
 lib/THC/THCTensorMathPointwise.cuh |  39 +-
 4 files changed, 191 insertions(+), 728 deletions(-)

diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 46092b73..4f317102 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -29,106 +29,3 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
 #endif
     in, in + len, out, __half2floatOp());
 }
-
-float THC_half2float(half a)
-{
-  unsigned int bits = a.x & 0x7fff;
-  unsigned int sign = a.x & 0x8000;
-  unsigned int exp = a.x & 0x7c00;
-
-  bits <<= 13;
-  sign <<= 16;
-
-  bits += 0x38000000U;
-
-  // flush denormals to 0
-  bits = (exp == 0 ? 0 : bits) | sign;
-
-  union {
-    float f;
-    unsigned int v;
-  } conv;
-  conv.v = bits;
-
-  return conv.f;
-}
-
-/*
-  Copyright (c) 2015, Norbert Juffa
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-  1. Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-
-  2. Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-half THC_float2half(float a)
-{
-  uint32_t ia;
-  uint16_t ir;
-  memcpy(&ia, &a, sizeof(float));
-
-  ir = (ia >> 16) & 0x8000;
-  if ((ia & 0x7f800000) == 0x7f800000) {
-    if ((ia & 0x7fffffff) == 0x7f800000) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ir = 0x7fff; /* canonical NaN */
-    }
-  } else if ((ia & 0x7f800000) >= 0x33000000) {
-    int shift = (int)((ia >> 23) & 0xff) - 127;
-    if (shift > 15) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
-      if (shift < -14) { /* denormal */
-        ir |= ia >> (-1 - shift);
-        ia = ia << (32 - (-1 - shift));
-      } else { /* normal */
-        ir |= ia >> (24 - 11);
-        ia = ia << (32 - (24 - 11));
-        ir = ir + ((14 + shift) << 10);
-      }
-      /* IEEE-754 round to nearest of even */
-      if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
-        ir++;
-      }
-    }
-  }
-
-  half ret;
-  memcpy(&ret, &ir, sizeof(half));
-  return ret;
-}
-
-THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
-#if CUDA_HALF_INSTRUCTIONS
-  cudaDeviceProp* prop =
-    THCState_getCurrentDeviceProperties(state);
-
-  // CC 5.3+
-  return (prop->major > 5 ||
-          (prop->major == 5 && prop->minor == 3));
-#else
-        return false;
-#endif
-}
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 616ec62e..11c3bb94 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -1,12 +1,12 @@
 #ifndef THC_HALF_CONVERSION_INC
-#define THC_HALF_CONVERSION_INC
+# define THC_HALF_CONVERSION_INC
 
-#include "THCGeneral.h"
+# include "THCGeneral.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
-#define CUDA_HALF_TENSOR 1
-#endif
+# if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#  define CUDA_HALF_TENSOR 1
+# endif
 
 /* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode")
    even if native harware support is available.
@@ -16,30 +16,54 @@
   (especially if the bulk of your code is in CUDNN and not Cutorch).
 */
 
-#define FORCE_PSEUDO_FP16 1
+# define FORCE_PSEUDO_FP16 1
 
-#ifndef FORCE_PSEUDO_FP16
+# ifndef FORCE_PSEUDO_FP16
 /* Kernel side: Native fp16 ALU instructions are available if we have this: */
-# if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530)
-#  define CUDA_HALF_INSTRUCTIONS 1
+#  if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530)
+#   define CUDA_HALF_INSTRUCTIONS 1
+#  endif
 # endif
-#endif
 
-#ifdef CUDA_HALF_TENSOR
-#include <cuda_fp16.h>
-#include <stdint.h>
+# ifdef CUDA_HALF_TENSOR
+
+#  include <cuda_fp16.h>
+#  include <stdint.h>
 
-/* CPU emulation */
-THC_EXTERNC half THC_float2half(float a);
+/* CPU conversion methods, scalar */
+THC_EXTERNC half  THC_float2half(float a);
 THC_EXTERNC float THC_half2float(half a);
 
-#if defined (__CUDA_ARCH__)
-# define THC_FLOAT_TO_HALF(x) __float2half((float)x)
-# define THC_HALF_TO_FLOAT(x) __half2float((float)x)
-#else
-# define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
-# define THC_HALF_TO_FLOAT(x) THC_half2float((float)x)
-#endif
+//
+// Vector conversion routines, using Thrust
+//
+THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
+THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
+
+#  if defined (__CUDA_ARCH__)
+/* use instrintic functons defined for device only in cuda_fp16.h */
+#   define THC_FLOAT_TO_HALF(x) __float2half((float)x)
+#   define THC_HALF_TO_FLOAT(x) __half2float((float)x)
+#  else
+/* use host conversion functions */
+#   define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
+#   define THC_HALF_TO_FLOAT(x) THC_half2float((float)x)
+#  endif
+
+/* Basic wrapper for 'half */
+struct Half: public half {
+  const half& val() const { return *this; }
+  half& val() { return *this; }
+  const half& operator half() const { return val(); }
+  Half(const half& v): half(v) {}
+  Half(const Half& v): half(v.val) {}
+};
+
+
+/* "pseudo-fp16" type: 16-bit storage, float math */
+struct PseudoHalf: public Half {
+  PseudoHalf(const Half& v): Half(v.val) {}
+};
 
 /* Check for native fp16 support on the current device (CC 5.3+) */
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state);
@@ -52,15 +76,6 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b
   return a.x != b.x;
 }
 
-#endif /* CUDA_HALF_TENSOR */
-
-#ifdef __CUDA_ARCH__
-//
-// host (CPU) routines
-//
-THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
-THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
-
 /// `half` has some type conversion issues associated with it, since it
 /// is a struct without a constructor/implicit conversion constructor.
 /// We use this to convert scalar values to the given type that the
@@ -68,12 +83,12 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
 
 template <typename In, typename Out>
 struct ScalarConvert {
-  static inline __host__ __device__ Out to(const In& v) { return Out(v); }
+  static inline __host__ __device__ const Out& to(const In& v) { return Out(v); }
 };
 
 template <typename Out>
 struct ScalarConvert<half, Out> {
-  static __host__ __device__ __forceinline__ Out to(const half& v) {
+  static __host__ __device__ __forceinline__ const Out& to(const half& v) {
     return (Out) THC_HALF_TO_FLOAT(v);
   }
 };
@@ -87,10 +102,10 @@ struct ScalarConvert<In, half> {
 
 template <>
 struct ScalarConvert<half, half> {
-  static __host__ __device__ __forceinline__ half to(const half& v) {
+  static __host__ __device__ __forceinline__ const half& to(const half& v) {
     return v;
   }
 };
-#endif /* CUDA */
 
-#endif
+# endif /* CUDA_HALF_TENSOR */
+#endif /* THC_HALF_CONVERSION_INC */
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 09443605..5817d59d 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -2,630 +2,208 @@
 #define THC_NUMERICS_INC
 
 #include <cuda.h>
-#include <limits.h>
+#include <limits>
 #include "THCHalf.h"
 
-/// Class for numeric limits of the particular data type, which
-/// includes support for `half`.
-/// Unfortunately since `half` does not have a constructor, these have
-/// to be expressed as functions (either that or non-const statics).
+using std::numeric_limits;
+
+template <typename T, bool is_int>
+struct THCNumericsBase {
+};
+
 template <typename T>
-struct THCNumerics {
+struct THC_math_traits
+{
+   typedef T storage_type;
+   /* type value should be converted to before doing math on it.
+      For most types, MathType==StorageType.
+   */
+   typedef T math_type;
+   /* type of expression , like (a*b). Usually == MathType */
+   typedef T expr_type;
 };
 
+/* default handling for bare half is pseudo */
 template <>
-struct THCNumerics<unsigned char> {
-  static inline __host__ __device__ unsigned char min() { return 0; }
-  static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; }
-
-  static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; }
-  static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; }
-  static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; }
-  static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; }
-  static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; }
-  static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; }
-
-  static inline __host__ __device__  unsigned char add(unsigned char a, unsigned char b) { return a + b; }
-  static inline __host__ __device__  unsigned char mul(unsigned char a, unsigned char b) { return a * b; }
-  static inline __host__ __device__  unsigned char sub(unsigned char a, unsigned char b) { return a - b; }
-  static inline __host__ __device__  unsigned char div(unsigned char a, unsigned char b) { return a / b; }
-  static inline __host__ __device__  unsigned char abs(unsigned char a) { return abs(a); }
-};
+struct THC_math_traits<half>
+{
+   typedef half storage_type;
+   typedef float math_type;
+   typedef float expr_type;
+}
 
 template <>
-struct THCNumerics<char> {
-  static inline __host__ __device__ char min() { return CHAR_MIN; }
-  static inline __host__ __device__ char max() { return CHAR_MAX; }
-
-  static inline __host__ __device__ bool lt(char a, char b) { return a < b; }
-  static inline __host__ __device__ bool le(char a, char b) { return a <= b; }
-  static inline __host__ __device__ bool gt(char a, char b) { return a > b; }
-  static inline __host__ __device__ bool ge(char a, char b) { return a >= b; }
-  static inline __host__ __device__ bool eq(char a, char b) { return a == b; }
-  static inline __host__ __device__ bool ne(char a, char b) { return a != b; }
-
-  static inline __host__ __device__  char add(char a, char b) { return a + b; }
-  static inline __host__ __device__  char mul(char a, char b) { return a * b; }
-  static inline __host__ __device__  char sub(char a, char b) { return a - b; }
-  static inline __host__ __device__  char div(char a, char b) { return a / b; }
-  static inline __host__ __device__  char abs(char a) { return abs(a); }
-};
+struct THC_math_traits<PseudoHalf>
+{
+   typedef half storage_type;
+   typedef float math_type;
+   typedef float expr_type;
+}
 
 template <>
-struct THCNumerics<short> {
-  static inline __host__ __device__ short min() { return SHRT_MIN; }
-  static inline __host__ __device__ short max() { return SHRT_MAX; }
-
-  static inline __host__ __device__ bool lt(short a, short b) { return a < b; }
-  static inline __host__ __device__ bool le(short a, short b) { return a <= b; }
-  static inline __host__ __device__ bool gt(short a, short b) { return a > b; }
-  static inline __host__ __device__ bool ge(short a, short b) { return a >= b; }
-  static inline __host__ __device__ bool eq(short a, short b) { return a == b; }
-  static inline __host__ __device__ bool ne(short a, short b) { return a != b; }
-
-  static inline __host__ __device__  short add(short a, short b) { return a + b; }
-  static inline __host__ __device__  short mul(short a, short b) { return a * b; }
-  static inline __host__ __device__  short sub(short a, short b) { return a - b; }
-  static inline __host__ __device__  short div(short a, short b) { return a / b; }
-  static inline __host__ __device__  short abs(short a) { return abs(a); }
+struct THC_math_traits<Half>
+{
+   typedef half storage_type;
+   typedef half math_type;
+   typedef half expr_type;
+}
+
+/// Class for numeric limits of the particular data type, which
+/// includes support for `half`.
+template <typename T>
+struct THCNumericsBase<T, true> {
+
+  typedef THC_math_traits<T> traits;
+  typedef typename traits::storage_type storage_type;
+  typedef typename traits::math_type math_type;
+  typedef typename traits::expr_type expr_type;
+
+  static const math_type one = 1;
+  static const math_type zero = 0;
+
+  static __host__ __device__ __forceinline__ const math_type& m_(const storage_type& a) {
+    return ScalarConvert<storage_type, math_type>::to(a);
+  }
+  static __host__ __device__ __forceinline__ const expr_type& e_(const math_type& a) {
+    return ScalarConvert<math_type, expr_type>::to(a);
+  }
+
+  static inline __host__ __device__ expr_type min() { return e_(numeric_limits<T>::min()); }
+  static inline __host__ __device__ expt_type max() { return e_(numeric_limits<T>::max()); }
+
+  static inline __host__ __device__ bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
+  static inline __host__ __device__ bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
+  static inline __host__ __device__ bool gt(const storage_type&  a, const storage_type&  b) { return m_(a) > m_(b);  }
+  static inline __host__ __device__ bool ge(const storage_type&  a, const storage_type&  b) { return m_(a) >= m_(b); }
+  static inline __host__ __device__ bool eq(const storage_type&  a, const storage_type&  b) { return m_(a) == m_(b); }
+  static inline __host__ __device__ bool ne(const storage_type&  a, const storage_type&  b) { return m_(a) != m_(b); }
+
+  static inline __host__ __device__  const expr_type&  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
+  static inline __host__ __device__  const expr_type&  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
+  static inline __host__ __device__  const expr_type&  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
+  static inline __host__ __device__  const expr_type&  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
+  static inline __host__ __device__  const expr_type&  abs(const storage_type&  a) { return e_(abs(m_(a))); }
 };
 
-template <>
-struct THCNumerics<int> {
-  static inline __host__ __device__ int min() { return INT_MIN; }
-  static inline __host__ __device__ int max() { return INT_MAX; }
-
-  static inline __host__ __device__ bool lt(int a, int b) { return a < b; }
-  static inline __host__ __device__ bool le(int a, int b) { return a <= b; }
-  static inline __host__ __device__ bool gt(int a, int b) { return a > b; }
-  static inline __host__ __device__ bool ge(int a, int b) { return a >= b; }
-  static inline __host__ __device__ bool eq(int a, int b) { return a == b; }
-  static inline __host__ __device__ bool ne(int a, int b) { return a != b; }
-
-  static inline __host__ __device__  int add(int a, int b) { return a + b; }
-  static inline __host__ __device__  int mul(int a, int b) { return a * b; }
-  static inline __host__ __device__  int sub(int a, int b) { return a - b; }
-  static inline __host__ __device__  int div(int a, int b) { return a / b; }
-  static inline __host__ __device__  int abs(int a) { return ::abs(a); }
+template <typename T>
+struct THCNumericsBase<T, false>: public THCNumericsBase<T, true> {
+
+  static const math_type one = 1.0;
+  static const math_type zero = 0.;
+
+  static inline __host__ __device__  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a)); }
+  static inline __host__ __device__  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
+  static inline __host__ __device__  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
+  static inline __host__ __device__  expr_type cos  (const storage_type& a) { return  e_(::cos(m_(a))); }
+  static inline __host__ __device__  expr_type sin  (const storage_type& a) { return  e_(::sin(m_(a))); }
+  static inline __host__ __device__  expr_type sqrt (const storage_type& a) { return  e_(::sqrt(m_(a))); }
+  static inline __host__ __device__  expr_type rsqrt(const storage_type& a) { return  e_(::rsqrt(m_(a))); }
+  static inline __host__ __device__  expr_type ceil (const storage_type& a) { return  e_(::ceil(m_(a))); }
+  static inline __host__ __device__  expr_type floor(const storage_type& a) { return  e_(::floor(m_(a))); }
+  static inline __host__ __device__  expr_type trunc(const storage_type& a) { return  e_(::trunc(m_(a))); }
+  static inline __host__ __device__  expr_type neg  (const storage_type& a) { return  e_(-m_(a)); }
+  static inline __host__ __device__  expr_type acos (const storage_type& a) { return  e_(::acos(m_(a))); }
+  static inline __host__ __device__  expr_type cosh (const storage_type& a) { return  e_(::cosh(m_(a))); }
+  static inline __host__ __device__  expr_type acosh(const storage_type& a) { return  e_(::acosh(m_(a))); }
+  static inline __host__ __device__  expr_type asin (const storage_type& a) { return  e_(::asin(m_(a))); }
+  static inline __host__ __device__  expr_type sinh (const storage_type& a) { return  e_(::sinh(m_(a))); }
+  static inline __host__ __device__  expr_type asinh(const storage_type& a) { return  e_(::asinh(m_(a))); }
+  static inline __host__ __device__  expr_type tan  (const storage_type& a) { return  e_(::tan(m_(a))); }
+  static inline __host__ __device__  expr_type atan (const storage_type& a) { return  e_(::atan(m_(a))); }
+  static inline __host__ __device__  expr_type tanh (const storage_type& a) { return  e_(::tanh(m_(a))); }
+  static inline __host__ __device__  expr_type abs  (const storage_type& a) { return  e_(::abs(m_(a))); }
+  static inline __host__ __device__  expr_type round(const storage_type& a) { return  e_(::round(m_(a))); }
+  static inline __host__ __device__  expr_type frac (const storage_type& a) { return  e_(m_(a) - ::trunc(m_(a))); }
+  static inline __host__ __device__  expr_type cinv (const storage_type& a) { return  e_(one / m_(m_(a)))); }
+  static inline __host__ __device__  expr_type pow  (const storage_type& a, T b) { return e_(::pow(a, b)); }
 };
 
+template <typename T>
+struct THCNumerics: public THCNumericsBase<T, numeric_limits<T>::is_integer> {
+};
+
+/* do we need this ? */
 template <>
-struct THCNumerics<long> {
-  static inline __host__ __device__ long min() { return LONG_MIN; }
-  static inline __host__ __device__ long max() { return LONG_MAX; }
-
-  static inline __host__ __device__ bool lt(long a, long b) { return a < b; }
-  static inline __host__ __device__ bool le(long a, long b) { return a <= b; }
-  static inline __host__ __device__ bool gt(long a, long b) { return a > b; }
-  static inline __host__ __device__ bool ge(long a, long b) { return a >= b; }
-  static inline __host__ __device__ bool eq(long a, long b) { return a == b; }
-  static inline __host__ __device__ bool ne(long a, long b) { return a != b; }
-
-  static inline __host__ __device__  long add(long a, long b) { return a + b; }
-  static inline __host__ __device__  long mul(long a, long b) { return a * b; }
-  static inline __host__ __device__  long sub(long a, long b) { return a - b; }
-  static inline __host__ __device__  long div(long a, long b) { return a / b; };
+struct THCNumerics<long> : public THCNumericsBase<long, true> {
   static inline __host__ __device__  long abs(long a) { return labs(a); }
 };
 
+
 #ifdef CUDA_HALF_TENSOR
+
 template <>
-struct THCNumerics<half> {
-  static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
-  static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
+struct THCNumerics<PseudoHalf>: public THCNumericsBase<PseudoHalf, false>  {
+};
 
-  static inline __host__ __device__ bool lt(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+template <>
+struct THCNumerics<Half>: public THCNumericsBase<Half, false>  {
+  static inline __host__ __device__ bool lt(const half& a, const half& b) {
     return __hlt(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa < fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) < THC_half2float(b);
-#endif
   }
-
-  static inline __host__ __device__ bool le(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ bool le(const half& a, const half& b) {
     return __hle(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa <= fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) <= THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool gt(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ bool gt(const half& a, const half& b) {
     return __hgt(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa > fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) > THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool ge(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ bool ge(const half& a, const half& b) {
     return __hge(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa >= fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) >= THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool eq(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ bool eq(const half& a, const half& b) {
     return __heq(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa == fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) == THC_half2float(b);
-#endif
   }
 
-  static inline __host__ __device__ bool ne(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ bool ne(const half& a, const half& b) {
     return __hne(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return fa != fb;
-#endif
-#else // __CUDA_ARCH__
-    return THC_half2float(a) != THC_half2float(b);
-#endif
   }
-
-  static inline __host__ __device__ half exp(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half exp(const half& a) {
     return hexp(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(expf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(expf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half log(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half log(const half& a) {
     return hlog(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(logf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(logf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half log1p(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(log1pf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(log1pf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half cos(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half cos(const half& a) {
     return hcos(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(cosf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(cosf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half sin(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half sin(const half& a) {
     return hsin(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(sinf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(sinf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half sqrt(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half sqrt(const half& a) {
     return hsqrt(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(sqrtf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(sqrtf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half rsqrt(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half rsqrt(const half& a) {
     return hrsqrt(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(rsqrtf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(rsqrtf(THC_half2float(a)));
-#endif
   }
-
-  static inline __host__ __device__ half ceil(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half ceil(const half& a) {
     return hceil(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(ceilf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(ceilf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half floor(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half floor(const half& a) {
     return hfloor(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(floorf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(floorf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half trunc(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half trunc(const half& a) {
     return htrunc(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(truncf(fa));
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(truncf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half neg(half a) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half neg(const half& a) {
     return __hneg(a);
-#else
-    float fa = __half2float(a);
-    return __float2half(-fa);
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(-(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half acos(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(acosf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(acosf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half cosh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(coshf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(coshf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half asin(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(asinf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(asinf(THC_half2float(a)));
-#endif
   }
 
-  static inline __host__ __device__ half sinh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(sinhf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(sinhf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half tan(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(tanf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(tanf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half atan(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(atanf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(atanf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half tanh(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(tanhf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(tanhf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half abs(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(fabs(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(fabs(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half round(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(roundf(fa));
-#else // __CUDA_ARCH__
-    return THC_float2half(roundf(THC_half2float(a)));
-#endif
-  }
-
-  static inline __host__ __device__ half frac(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(fa - truncf(fa));
-#else // __CUDA_ARCH__
-    float fa = THC_half2float(a);
-    return THC_float2half(fa - floorf(fa));
-#endif
-  }
-
-  static inline __host__ __device__ half cinv(half a) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    return __float2half(1.0f / fa);
-#else // __CUDA_ARCH__
-    return THC_float2half(1.0f / THC_half2float(a));
-#endif
-  }
-
-  static inline __host__ __device__ half add(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ const half& add(const half& a, const half& b) {
     return __hadd(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa + fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) + THC_half2float(b));
-#endif
   }
-
-  static inline __host__ __device__ half div(half a, half b) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa / fb );
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) / THC_half2float(b));
-#endif
-  }
-
-  static inline __host__ __device__ half mul(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half mul(const half& a, const half& b) {
     return __hmul(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa * fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) * THC_half2float(b));
-#endif
   }
 
-  static inline __host__ __device__ half sub(half a, half b) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
+  static inline __host__ __device__ half sub(const half& a, const half& b) {
     return __hsub(a, b);
-#else
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half( fa - fb );
-#endif
-#else // __CUDA_ARCH__
-    return THC_float2half(THC_half2float(a) - THC_half2float(b));
-#endif
-  }
-
-  static inline __host__ __device__ half pow(half a, half b) {
-#ifdef __CUDA_ARCH__
-    float fa = __half2float(a);
-    float fb = __half2float(b);
-    return __float2half(powf(fa, fb));
-#else // __CUDA_ARCH__
-    return THC_float2half(powf(THC_half2float(a), THC_half2float(b)));
-#endif
-  }
-
-};
-#endif
-
-template <>
-struct THCNumerics<float> {
-  static inline __host__ __device__ float min() { return -FLT_MAX; }
-  static inline __host__ __device__ float max() { return FLT_MAX; }
-
-  static inline __host__ __device__ bool lt(float a, float b) { return a < b; }
-  static inline __host__ __device__ bool le(float a, float b) { return a <= b; }
-  static inline __host__ __device__ bool gt(float a, float b) { return a > b; }
-  static inline __host__ __device__ bool ge(float a, float b) { return a >= b; }
-  static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
-  static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
-
-  static inline __host__ __device__  float exp  (float a) { return   expf(a); }
-  static inline __host__ __device__  float log  (float a) { return   logf(a); }
-  static inline __host__ __device__  float log1p(float a) { return log1pf(a); }
-  static inline __host__ __device__  float cos  (float a) { return   cosf(a); }
-  static inline __host__ __device__  float sin  (float a) { return   sinf(a); }
-  static inline __host__ __device__  float sqrt (float a) { return  sqrtf(a); }
-  static inline __host__ __device__  float rsqrt(float a) { return rsqrtf(a); }
-  static inline __host__ __device__  float ceil (float a) { return  ceilf(a); }
-  static inline __host__ __device__  float floor(float a) { return floorf(a); }
-  static inline __host__ __device__  float trunc(float a) { return truncf(a); }
-  static inline __host__ __device__  float neg  (float a) { return        -a; }
-  static inline __host__ __device__  float acos (float a) { return  acosf(a); }
-  static inline __host__ __device__  float cosh (float a) { return  coshf(a); }
-  static inline __host__ __device__  float acosh(float a) { return acoshf(a); }
-  static inline __host__ __device__  float asin (float a) { return  asinf(a); }
-  static inline __host__ __device__  float sinh (float a) { return  sinhf(a); }
-  static inline __host__ __device__  float asinh(float a) { return asinhf(a); }
-  static inline __host__ __device__  float tan  (float a) { return   tanf(a); }
-  static inline __host__ __device__  float atan (float a) { return  atanf(a); }
-  static inline __host__ __device__  float tanh (float a) { return  tanhf(a); }
-  static inline __host__ __device__  float abs  (float a) { return   fabs(a); }
-  static inline __host__ __device__  float round(float a) { return roundf(a); }
-  static inline __host__ __device__  float frac (float a) { return a - truncf(a); }
-  static inline __host__ __device__  float cinv (float a) { return 1.0f / a; }
-  static inline __host__ __device__  float add  (float a, float b) { return a + b; }
-  static inline __host__ __device__  float div  (float a, float b) { return a / b; }
-  static inline __host__ __device__  float mul  (float a, float b) { return a * b; }
-  static inline __host__ __device__  float sub  (float a, float b) { return a - b; }
-  static inline __host__ __device__  float pow  (float a, float b) { return powf(a, b); }
-};
-
-template <>
-struct THCNumerics<double> {
-  static inline __host__ __device__ double min() { return -DBL_MAX; }
-  static inline __host__ __device__ double max() { return DBL_MAX; }
-
-  static inline __host__ __device__ bool lt(double a, double b) { return a < b; }
-  static inline __host__ __device__ bool le(double a, double b) { return a <= b; }
-  static inline __host__ __device__ bool gt(double a, double b) { return a > b; }
-  static inline __host__ __device__ bool ge(double a, double b) { return a >= b; }
-  static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
-  static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
-
-  static inline __host__ __device__  double exp  (double a) { return   ::exp(a); }
-  static inline __host__ __device__  double log  (double a) { return   ::log(a); }
-  static inline __host__ __device__  double log1p(double a) { return ::log1p(a); }
-  static inline __host__ __device__  double cos  (double a) { return   ::cos(a); }
-  static inline __host__ __device__  double sin  (double a) { return   ::sin(a); }
-  static inline __host__ __device__  double sqrt (double a) { return  ::sqrt(a); }
-  static inline __host__ __device__  double rsqrt(double a) { return ::rsqrt(a); }
-  static inline __host__ __device__  double ceil (double a) { return  ::ceil(a); }
-  static inline __host__ __device__  double floor(double a) { return ::floor(a); }
-  static inline __host__ __device__  double trunc(double a) { return ::trunc(a); }
-  static inline __host__ __device__  double neg  (double a) { return       -a; }
-  static inline __host__ __device__  double acos (double a) { return  ::acos(a); }
-  static inline __host__ __device__  double cosh (double a) { return  ::cosh(a); }
-  static inline __host__ __device__  double acosh(double a) { return ::acosh(a); }
-  static inline __host__ __device__  double asin (double a) { return  ::asin(a); }
-  static inline __host__ __device__  double sinh (double a) { return  ::sinh(a); }
-  static inline __host__ __device__  double asinh(double a) { return ::asinh(a); }
-  static inline __host__ __device__  double tan  (double a) { return   ::tan(a); }
-  static inline __host__ __device__  double atan (double a) { return  ::atan(a); }
-  static inline __host__ __device__  double tanh (double a) { return  ::tanh(a); }
-  static inline __host__ __device__  double abs  (double a) { return   ::abs(a); }
-  static inline __host__ __device__  double round(double a) { return ::round(a); }
-  static inline __host__ __device__  double frac (double a) { return a - ::trunc(a); }
-  static inline __host__ __device__  double cinv (double a) { return 1.0 / a; }
-  static inline __host__ __device__  double add  (double a, double b) { return a + b; }
-  static inline __host__ __device__  double div  (double a, double b) { return a / b; }
-  static inline __host__ __device__  double mul  (double a, double b) { return a * b; }
-  static inline __host__ __device__  double sub  (double a, double b) { return a - b; }
-  static inline __host__ __device__  double pow  (double a, double b) { return ::pow(a, b); }
-};
-
-/// `half` has some type conversion issues associated with it, since it
-/// is a struct without a constructor/implicit conversion constructor.
-/// We use this to convert scalar values to the given type that the
-/// tensor expects.
-template <typename In, typename Out>
-struct ScalarConvert {
-  static __host__ __device__ Out to(const In v) { return (Out) v; }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <typename Out>
-struct ScalarConvert<half, Out> {
-  static __host__ __device__ Out to(const half v) {
-#ifdef __CUDA_ARCH__
-    return (Out) __half2float(v);
-#else
-    return (Out) THC_half2float(v);
-#endif
-  }
-};
-
-template <typename In>
-struct ScalarConvert<In, half> {
-  static __host__ __device__ half to(const In v) {
-#ifdef __CUDA_ARCH__
-    return __float2half((float) v);
-#else
-    return THC_float2half((float) v);
-#endif
-  }
-};
-
-template <>
-struct ScalarConvert<half, half> {
-  static __host__ __device__ half to(const half v) {
-    return v;
   }
 };
 #endif
 
-#endif // THC_NUMERICS_INC
+#endif // THC_NUMERICS_INC
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index c52e0827..f7c6d2af 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -12,51 +12,24 @@
 template <typename T>
 struct TensorSigmoidOp {
   __device__ __forceinline__ void operator()(T* out, T* in) const {
-    T one = (T) 1.0;
-    *out = one / (one + THCNumerics<T>::exp(- *in));
+    *out = THCNumerics<T>::one / (THCNumerics<T>::one + THCNumerics<T>::exp(- *in));
   }
 
   __device__ __forceinline__ void operator()(T* v) const {
-    T one = (T) 1.0;
-    *v = one / (one + THCNumerics<T>::exp(- *v));
+    *v =  THCNumerics<T>::one / (THCNumerics<T>::one + THCNumerics<T>::exp(- *v));
   }
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSigmoidOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) const {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half one = ScalarConvert<int, half>::to(1);
-    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
-#else
-    float fin = __half2float(*in);
-    *out = __float2half(1.0f / (1.0f + expf(- fin)));
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) const {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half one = ScalarConvert<int, half>::to(1);
-    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
-#else
-    float fv = __half2float(*v);
-    *v = __float2half(1.0f / (1.0f + expf(- fv)));
-#endif
-  }
-};
-#endif
-
 template <typename T>
 struct TensorSignOp {
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    T orig = *in;
-    *out = (orig > 0) - (orig < 0);
+    const T& orig = *in;
+    *out = (THCNumerics<T>::gt(orig, THCNumerics<T>::zero) ? THCNumerics<T>::one :
+            (orig < 0);
   }
 
   __device__ __forceinline__ void operator()(T* v) {
-    T orig = *v;
-    *v = (orig > 0) - (orig < 0);
+     operator()(v, v);
   }
 };
 

From acaf1a64d4fed30de3fa322926cd1babcfa1cd3b Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Sun, 16 Oct 2016 22:27:04 -0700
Subject: [PATCH 04/13] Checkpoint

---
 lib/THC/THCAtomics.cuh                    |   4 +-
 lib/THC/THCHalf.cu                        |   6 +
 lib/THC/THCHalf.h                         |  40 +-
 lib/THC/THCNumerics.cuh                   | 129 ++++---
 lib/THC/THCTensorMathPairwise.cu          | 215 +----------
 lib/THC/THCTensorMathPointwise.cuh        | 429 +++++-----------------
 lib/THC/THCTensorTypeUtils.cuh            |  51 ---
 lib/THC/generic/THCTensorMathPointwise.cu |   4 +-
 8 files changed, 200 insertions(+), 678 deletions(-)

diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
index 42291144..606a776a 100644
--- a/lib/THC/THCAtomics.cuh
+++ b/lib/THC/THCAtomics.cuh
@@ -98,12 +98,12 @@ static inline  __device__ void atomicAdd(half *address, half val) {
       (unsigned int *) ((char *)address - ((size_t)address & 2));
   unsigned int old = *address_as_ui;
   unsigned int assumed;
-
+  typedef THCNumerics<half> N_;
   do {
     assumed = old;
     half hsum;
     hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-    hsum = THCNumerics<half>::add(hsum, val);
+    hsum = N_::s_(N_::add(hsum, val));
     old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
     old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 4f317102..cd957605 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -1,4 +1,5 @@
 #include "THCHalf.h"
+#include "THCNumerics.cuh"
 #include <thrust/transform.h>
 #include <thrust/execution_policy.h>
 
@@ -29,3 +30,8 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
 #endif
     in, in + len, out, __half2floatOp());
 }
+
+#if defined (__CUDA_ARCH__) && defined (CUDA_FP16_INSTRINTICS)
+template <> const half THCMathTraitsBase<Half>::one() { return THC_FLOAT_TO_HALF(1.); }
+template <> const half THCMathTraitsBase<Half>::zero(){ return THC_FLOAT_TO_HALF(0.); }
+#endif
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 11c3bb94..f4c0eb71 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -8,23 +8,6 @@
 #  define CUDA_HALF_TENSOR 1
 # endif
 
-/* This define forces use of 32-bit float math on 16-bit float type 'half' (a.k.a. "pseudo-fp16 mode")
-   even if native harware support is available.
-   This makes difference for Pascal (6.x) cards only: Maxwell (5.x) cards always run 'half' in pseudo mode.
-   !!! Uncomment on your own risk !!!
-   Native fp16 operations may in fact run slower than pseudo-fp16 on your system at the moment
-  (especially if the bulk of your code is in CUDNN and not Cutorch).
-*/
-
-# define FORCE_PSEUDO_FP16 1
-
-# ifndef FORCE_PSEUDO_FP16
-/* Kernel side: Native fp16 ALU instructions are available if we have this: */
-#  if defined(CUDA_HALF_TENSOR) && (CUDA_VERSION >= 8000) && (__CUDA_ARCH__ >= 530)
-#   define CUDA_HALF_INSTRUCTIONS 1
-#  endif
-# endif
-
 # ifdef CUDA_HALF_TENSOR
 
 #  include <cuda_fp16.h>
@@ -43,27 +26,28 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
 #  if defined (__CUDA_ARCH__)
 /* use instrintic functons defined for device only in cuda_fp16.h */
 #   define THC_FLOAT_TO_HALF(x) __float2half((float)x)
-#   define THC_HALF_TO_FLOAT(x) __half2float((float)x)
+#   define THC_HALF_TO_FLOAT(x) __half2float(x)
 #  else
 /* use host conversion functions */
 #   define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
-#   define THC_HALF_TO_FLOAT(x) THC_half2float((float)x)
+#   define THC_HALF_TO_FLOAT(x) THC_half2float(x)
 #  endif
 
-/* Basic wrapper for 'half */
+#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+# define CUDA_FP16_INSTRINTICS 1
+#endif
+
+/* Basic wrapper for 'native' half */
 struct Half: public half {
+ public:
   const half& val() const { return *this; }
   half& val() { return *this; }
-  const half& operator half() const { return val(); }
   Half(const half& v): half(v) {}
-  Half(const Half& v): half(v.val) {}
+  Half(const Half& v): half(v.val()) {}
 };
 
-
 /* "pseudo-fp16" type: 16-bit storage, float math */
-struct PseudoHalf: public Half {
-  PseudoHalf(const Half& v): Half(v.val) {}
-};
+typedef  half PseudoHalf;
 
 /* Check for native fp16 support on the current device (CC 5.3+) */
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state);
@@ -83,12 +67,12 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b
 
 template <typename In, typename Out>
 struct ScalarConvert {
-  static inline __host__ __device__ const Out& to(const In& v) { return Out(v); }
+  static inline __host__ __device__ Out to(const In& v) { return Out(v); }
 };
 
 template <typename Out>
 struct ScalarConvert<half, Out> {
-  static __host__ __device__ __forceinline__ const Out& to(const half& v) {
+  static __host__ __device__ __forceinline__ Out to(const half& v) {
     return (Out) THC_HALF_TO_FLOAT(v);
   }
 };
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 5817d59d..1c1b3cb2 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -5,71 +5,75 @@
 #include <limits>
 #include "THCHalf.h"
 
-using std::numeric_limits;
+
+template <typename T>
+struct CudaNumericLimits
+{
+};
 
 template <typename T, bool is_int>
 struct THCNumericsBase {
 };
 
 template <typename T>
-struct THC_math_traits
+struct THCMathTraitsBase
 {
    typedef T storage_type;
    /* type value should be converted to before doing math on it.
-      For most types, MathType==StorageType.
+      For most types except 16-bit floats, MathType==StorageType.
    */
    typedef T math_type;
    /* type of expression , like (a*b). Usually == MathType */
    typedef T expr_type;
+
+   static const storage_type one();
+   static const storage_type zero();
 };
 
-/* default handling for bare half is pseudo */
-template <>
-struct THC_math_traits<half>
+template <typename T>
+const T THCMathTraitsBase<T>::zero() { return T(0); }
+
+template <typename T>
+const T THCMathTraitsBase<T>::one() { return T(1); }
+
+template <typename T>
+struct THCMathTraits: public THCMathTraitsBase<T>
 {
-   typedef half storage_type;
-   typedef float math_type;
-   typedef float expr_type;
-}
+};
 
+/* default handling for bare half is pseudo */
 template <>
-struct THC_math_traits<PseudoHalf>
+struct THCMathTraits<half>: public THCMathTraitsBase<half>
 {
-   typedef half storage_type;
    typedef float math_type;
-   typedef float expr_type;
-}
+   typedef half expr_type;
+};
 
 template <>
-struct THC_math_traits<Half>
+struct THCMathTraits<Half>: public THCMathTraitsBase<half>
 {
-   typedef half storage_type;
    typedef half math_type;
    typedef half expr_type;
-}
+};
 
-/// Class for numeric limits of the particular data type, which
-/// includes support for `half`.
 template <typename T>
-struct THCNumericsBase<T, true> {
-
-  typedef THC_math_traits<T> traits;
+struct THCNumericsCommonBase {
+  typedef THCMathTraits<T> traits;
   typedef typename traits::storage_type storage_type;
   typedef typename traits::math_type math_type;
   typedef typename traits::expr_type expr_type;
 
-  static const math_type one = 1;
-  static const math_type zero = 0;
-
-  static __host__ __device__ __forceinline__ const math_type& m_(const storage_type& a) {
+  static __host__ __device__ __forceinline__ math_type m_(const storage_type& a) {
     return ScalarConvert<storage_type, math_type>::to(a);
   }
-  static __host__ __device__ __forceinline__ const expr_type& e_(const math_type& a) {
+  static __host__ __device__ __forceinline__ expr_type e_(const math_type& a) {
     return ScalarConvert<math_type, expr_type>::to(a);
   }
-
-  static inline __host__ __device__ expr_type min() { return e_(numeric_limits<T>::min()); }
-  static inline __host__ __device__ expt_type max() { return e_(numeric_limits<T>::max()); }
+  static __host__ __device__ __forceinline__ storage_type s_(const expr_type& a) {
+    return ScalarConvert<expr_type, storage_type>::to(a);
+  }
+  static __host__ __device__ const T min();
+  static __host__ __device__ const T max();
 
   static inline __host__ __device__ bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
   static inline __host__ __device__ bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
@@ -78,20 +82,48 @@ struct THCNumericsBase<T, true> {
   static inline __host__ __device__ bool eq(const storage_type&  a, const storage_type&  b) { return m_(a) == m_(b); }
   static inline __host__ __device__ bool ne(const storage_type&  a, const storage_type&  b) { return m_(a) != m_(b); }
 
-  static inline __host__ __device__  const expr_type&  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
-  static inline __host__ __device__  const expr_type&  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
-  static inline __host__ __device__  const expr_type&  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
-  static inline __host__ __device__  const expr_type&  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
-  static inline __host__ __device__  const expr_type&  abs(const storage_type&  a) { return e_(abs(m_(a))); }
+  static inline __host__ __device__  expr_type  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
+  static inline __host__ __device__  expr_type  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
+  static inline __host__ __device__  expr_type  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
+  static inline __host__ __device__  expr_type  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
+  static inline __host__ __device__  expr_type  abs(const storage_type&  a) { return e_(abs(m_(a))); }
+  static inline __host__ __device__  expr_type  neg(const storage_type& a) { return  e_(-m_(a)); }
+  static inline __host__ __device__  expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); }
+
 };
 
 template <typename T>
-struct THCNumericsBase<T, false>: public THCNumericsBase<T, true> {
+__host__ __device__ const T THCNumericsCommonBase<T>::min() { return std::numeric_limits<T>::min(); }
+
+template <typename T>
+__host__ __device__ const T THCNumericsCommonBase<T>::max() { return std::numeric_limits<T>::max(); }
+
+/* specialized versions */
+template <>
+const half THCNumericsCommonBase<half>::min();
+
+template <>
+const half THCNumericsCommonBase<half>::max();
 
-  static const math_type one = 1.0;
-  static const math_type zero = 0.;
+/// Class for numeric limits of the particular data type, which
+/// includes support for `half`.
+template <typename T>
+struct THCNumericsBase<T, true>  : public THCNumericsCommonBase<T> {
+};
 
-  static inline __host__ __device__  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a)); }
+template <typename T>
+struct THCNumericsBase<T, false> : public THCNumericsCommonBase<T> {
+  typedef THCNumericsCommonBase<T> Base;
+  using typename Base::traits;
+  using typename Base::math_type;
+  using typename Base::expr_type;
+  using typename Base::storage_type;
+  using Base::e_;
+  using Base::m_;
+  using Base::s_;
+
+
+  static inline __host__ __device__  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a))); }
   static inline __host__ __device__  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
   static inline __host__ __device__  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
   static inline __host__ __device__  expr_type cos  (const storage_type& a) { return  e_(::cos(m_(a))); }
@@ -101,7 +133,6 @@ struct THCNumericsBase<T, false>: public THCNumericsBase<T, true> {
   static inline __host__ __device__  expr_type ceil (const storage_type& a) { return  e_(::ceil(m_(a))); }
   static inline __host__ __device__  expr_type floor(const storage_type& a) { return  e_(::floor(m_(a))); }
   static inline __host__ __device__  expr_type trunc(const storage_type& a) { return  e_(::trunc(m_(a))); }
-  static inline __host__ __device__  expr_type neg  (const storage_type& a) { return  e_(-m_(a)); }
   static inline __host__ __device__  expr_type acos (const storage_type& a) { return  e_(::acos(m_(a))); }
   static inline __host__ __device__  expr_type cosh (const storage_type& a) { return  e_(::cosh(m_(a))); }
   static inline __host__ __device__  expr_type acosh(const storage_type& a) { return  e_(::acosh(m_(a))); }
@@ -114,29 +145,24 @@ struct THCNumericsBase<T, false>: public THCNumericsBase<T, true> {
   static inline __host__ __device__  expr_type abs  (const storage_type& a) { return  e_(::abs(m_(a))); }
   static inline __host__ __device__  expr_type round(const storage_type& a) { return  e_(::round(m_(a))); }
   static inline __host__ __device__  expr_type frac (const storage_type& a) { return  e_(m_(a) - ::trunc(m_(a))); }
-  static inline __host__ __device__  expr_type cinv (const storage_type& a) { return  e_(one / m_(m_(a)))); }
-  static inline __host__ __device__  expr_type pow  (const storage_type& a, T b) { return e_(::pow(a, b)); }
+  static inline __host__ __device__  expr_type cinv (const storage_type& a) { return  Base::div(Base::traits::one(), a); }
+  static inline __host__ __device__  expr_type pow  (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); }
 };
 
 template <typename T>
-struct THCNumerics: public THCNumericsBase<T, numeric_limits<T>::is_integer> {
+struct THCNumerics: public THCNumericsBase<T, std::numeric_limits<T>::is_integer> {
 };
 
-/* do we need this ? */
-template <>
-struct THCNumerics<long> : public THCNumericsBase<long, true> {
-  static inline __host__ __device__  long abs(long a) { return labs(a); }
-};
-
-
 #ifdef CUDA_HALF_TENSOR
 
 template <>
-struct THCNumerics<PseudoHalf>: public THCNumericsBase<PseudoHalf, false>  {
+struct THCNumerics<half>: public THCNumericsBase<half, false>  {
 };
 
+#if defined (__CUDA_ARCH__) && defined (CUDA_FP16_INSTRINTICS)
 template <>
 struct THCNumerics<Half>: public THCNumericsBase<Half, false>  {
+
   static inline __host__ __device__ bool lt(const half& a, const half& b) {
     return __hlt(a, b);
   }
@@ -204,6 +230,7 @@ struct THCNumerics<Half>: public THCNumericsBase<Half, false>  {
     return __hsub(a, b);
   }
 };
+# endif
 #endif
 
 #endif // THC_NUMERICS_INC
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 2695f2df..bec31035 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -7,237 +7,56 @@
 
 template <typename T>
 struct TensorAddConstantOp {
+  typedef THCNumerics<T> N_;
   TensorAddConstantOp(T v) : val(v) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in + val;
+    *out = N_::s_(N_::add(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v += val;
-  }
-
-  const T val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorAddConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorAddConstantOp(half v) : val(v) {}
-#else
-  TensorAddConstantOp(half v) : fval(THC_half2float(v)) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin + fval;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hadd(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv += fval;
-    *v = __float2half(fv);
-#endif
+        this->operator()(v, v);
   }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
+  const typename N_::storage_type val;
 };
-#endif // CUDA_HALF_TENSOR
-
 
 template <typename T>
 struct TensorSubConstantOp {
+  typedef THCNumerics<T> N_;
   TensorSubConstantOp(T v) : val(v) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in - val;
+    *out = N_::s_(N_::sub(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v -= val;
-  }
-
-  const T val;
-};
-
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSubConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorSubConstantOp(half v): val(THC_float2half(-(THC_half2float(v)))) {}
-#else
-  TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin + fval;
-    *out = __float2half(fout);
-#endif
+        this->operator()(v, v);
   }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hadd(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv += fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
+  const typename N_::storage_type val;
 };
-#endif // CUDA_HALF_TENSOR
 
 
 template <typename T>
 struct TensorMulConstantOp {
   TensorMulConstantOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in * val;
+    *out = N_::s_(N_::mul(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v *= val;
-  }
-
-  const T val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorMulConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorMulConstantOp(half v) : val(v) {}
-#else
-  TensorMulConstantOp(half v) : fval(THC_half2float(v)) {}
-#endif
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin * fval;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hmul(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv *= fval;
-    *v = __float2half(fv);
-#endif
+        this->operator()(v, v);
   }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
+  const typename N_::storage_type val;
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorDivConstantOp {
-  TensorDivConstantOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
+  TensorDivConstantOp(const T& v) : val(v) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *in / val;
+    *out = N_::s_(N_::div(*in, val));
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-    *v /= val;
-  }
-
-  const T val;
-};
-
-template <>
-struct TensorDivConstantOp<float> {
-  TensorDivConstantOp(float v) : val(1.f / v) {}
-  __device__ __forceinline__ void operator()(float* out, float* in) {
-    *out = *in * val;
-  }
-
-  __device__ __forceinline__ void operator()(float* v) {
-    *v *= val;
-  }
-
-  const float val;
-};
-
-template <>
-struct TensorDivConstantOp<double> {
-  TensorDivConstantOp(double v) : val(1. / v) {}
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = *in * val;
-  }
-
-  __device__ __forceinline__ void operator()(double* v) {
-    *v *= val;
-  }
-
-  const double val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorDivConstantOp<half> {
-#ifdef CUDA_HALF_INSTRUCTIONS
-  TensorDivConstantOp(half v) : val(ScalarInv<half>::to(v)) {}
-#else
-  TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {}
-#endif
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in, val);
-#else
-    float fin = __half2float(*in);
-    float fout = fin * fval;
-    *out = __float2half(fout);
-#endif
+        this->operator()(v, v);
   }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hmul(*v, val);
-#else
-    float fv = __half2float(*v);
-    fv *= fval;
-    *v = __float2half(fv);
-#endif
-  }
-
-#ifdef CUDA_HALF_INSTRUCTIONS
-  const half val;
-#else
-  const float fval;
-#endif
+  const typename N_::storage_type val;
 };
-#endif // CUDA_HALF_TENSOR
 
 template <int Upper>
 struct TensorTriOp {
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index f7c6d2af..dc696b18 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -11,417 +11,163 @@
 
 template <typename T>
 struct TensorSigmoidOp {
+  typedef THCNumerics<T> N_;
+  typedef typename N_::traits traits;
   __device__ __forceinline__ void operator()(T* out, T* in) const {
-    *out = THCNumerics<T>::one / (THCNumerics<T>::one + THCNumerics<T>::exp(- *in));
+    *out = N_::div(traits::one(), N_::add(traits::one(), N_::neg(*in)));
   }
-
   __device__ __forceinline__ void operator()(T* v) const {
-    *v =  THCNumerics<T>::one / (THCNumerics<T>::one + THCNumerics<T>::exp(- *v));
+     this->operator()(v, v);
   }
 };
 
 template <typename T>
 struct TensorSignOp {
+  typedef THCNumerics<T> N_;
+  typedef typename N_::traits traits;
+
   __device__ __forceinline__ void operator()(T* out, T* in) {
     const T& orig = *in;
-    *out = (THCNumerics<T>::gt(orig, THCNumerics<T>::zero) ? THCNumerics<T>::one :
-            (orig < 0);
+    *out = (N_::gt(orig, traits::zero()) ? traits::one() :
+            N_::lt(orig, traits::zero()) ? N_::neg(traits::one()) :
+            traits::zero());
   }
-
   __device__ __forceinline__ void operator()(T* v) {
-     operator()(v, v);
-  }
-};
-
-template <>
-struct TensorSignOp<unsigned char> {
-  __device__ __forceinline__ void operator()(unsigned char* out, unsigned char* in) {
-    unsigned char orig = *in;
-    *out = (orig == 0) ? 0 : 1;
-  }
-
-  __device__ __forceinline__ void operator()(unsigned char* v) {
-    unsigned char orig = *v;
-    *v = (orig == 0) ? 0 : 1;
-  }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSignOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half zero = ScalarConvert<int, half>::to(0);
-    half orig = *in;
-    *out = __float2half((float) __hgt(orig, zero) - (float) __hlt(orig, zero));
-#else
-    float orig = __half2float(*in);
-    *out = __float2half((orig > 0) - (orig < 0));
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    half zero = ScalarConvert<int, half>::to(0);
-    half orig = *v;
-    *v = __float2half((float) __hgt(orig, zero) -  (float) __hlt(orig, zero));
-#else
-    float orig = __half2float(*v);
-    *v = __float2half((orig > 0) - (orig < 0));
-#endif
+     this->operator()(v, v);
   }
 };
-#endif
 
 template <typename T>
 struct TensorAddOp {
+  typedef THCNumerics<T> N_;
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = N_::s_(N_::add(*in1, *in2));
+  }
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out += *in;
+    this->operator()(out, in, out);
   }
 
-  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 + *in2;
-  }
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorAddOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout += fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 + fin2;
-    *out = __float2half(fout);
-#endif
-  }
-};
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorCAddOp {
+  typedef THCNumerics<T> N_;
   TensorCAddOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out += val * *in;
+    this->operator()(out, out, in);
   }
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 + val * *in2;
+    *out = N_::s_(N_::add(*in1, N_::mul(val, *in2)));
   }
 
-  T val;
+ const typename N_::storage_type val;
 };
 
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCAddOp<half> {
-  TensorCAddOp(half v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*out, __hmul(val, *in));
-#else
-    float fout = __half2float(*out);
-    float fval = __half2float(val);
-    float fin = __half2float(*in);
-
-    fout += fval * fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hadd(*in1, __hmul(val, *in2));
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fval = __half2float(val);
-
-    float fout = fin1 + fval * fin2;
-    *out = __float2half(fout);
-#endif
-  }
-
-  half val;
-};
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorSubOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out -= *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 - *in2;
+    *out = N_::s_(N_::sub(*in1, *in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorSubOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout -= fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 - fin2;
-    *out = __float2half(fout);
-#endif
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorMulOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out *= *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 * *in2;
+    *out = N_::s_(N_::mul(*in1, *in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorMulOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*out, *in);
-#else
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout *= fin;
-    *out = __float2half(fout);
-#endif
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-#ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hmul(*in1, *in2);
-#else
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 * fin2;
-    *out = __float2half(fout);
-#endif
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template<typename T>
 struct TensorPowOp {
-  TensorPowOp(T v) : val(v) {}
+  typedef THCNumerics<T> N_;
+  TensorPowOp(T v) : val(N_::s_(v)) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = powf((float) *in, (float) val);
+    *out = N_::s_(N_::pow(*in, val));
   }
 
   __device__ __forceinline__ void operator()(T* v) {
-    *v = powf((float) *v, (float) val);
+    this->operator()(v, v);
   }
-
-  const T val;
+ const typename N_::storage_type val;
 };
 
-template <>
-struct TensorPowOp<double> {
-  TensorPowOp(double v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = pow(*in, val);
-  }
-
-  __device__ __forceinline__ void operator()(double* v) {
-    *v = pow(*v, val);
-  }
-
-  const double val;
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorPowOp<half> {
-  TensorPowOp(half v) : val(v) {}
-
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    // No fp16 pow function yet
-    float fin = __half2float(*in);
-    float fval = __half2float(val);
-    float fout = powf(fin, fval);
-    *out = __float2half(fout);
-  }
-
-  __device__ __forceinline__ void operator()(half* v) {
-    // No fp16 pow function yet
-    float fv = __half2float(*v);
-    float fval = __half2float(val);
-    float fout = powf(fv, fval);
-    *v = __float2half(fout);
-  }
-
-  const half val;
-};
-#endif // CUDA_HALF_TENSOR
-
 template <typename T>
 struct TensorCPowOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = powf((float) *out, (float) *in);
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = powf((float) *in1, (float) *in2);
-  }
-};
-
-template <>
-struct TensorCPowOp<double> {
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = pow(*out, *in);
+    *out = N_::s_(N_::pow(*in1,*in2));
   }
-
-  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
-    *out = pow(*in1, *in2);
-  }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCPowOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    // No fp16 pow function yet
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout = powf(fout, fin);
-    *out = __float2half(fout);
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-    // No fp16 pow function yet
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = powf(fin1, fin2);
-    *out = __float2half(fout);
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorDivOp {
-  __device__ __forceinline__ void
-  operator()(T* out, T* in) {
-    *out /= *in;
-  }
-
-  __device__ __forceinline__ void
-  operator()(T* out, T* in1, T* in2) {
-    *out = *in1 / *in2;
+  typedef THCNumerics<T> N_;
+  __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+    *out = N_::s_(N_::div(*in1,*in2));
   }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorDivOp<half> {
-  __device__ __forceinline__ void
-  operator()(half* out, half* in) {
-    // No fp16 div instruction yet
-    float fout = __half2float(*out);
-    float fin = __half2float(*in);
-    fout /= fin;
-    *out = __float2half(fout);
-  }
-
-  __device__ __forceinline__ void
-  operator()(half* out, half* in1, half* in2) {
-    // No fp16 div instruction yet
-    float fin1 = __half2float(*in1);
-    float fin2 = __half2float(*in2);
-    float fout = fin1 / fin2;
-    *out = __float2half(fout);
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorClampOp {
+  typedef THCNumerics<T> N_;
   TensorClampOp(T min, T max) : minValue(min), maxValue(max) {}
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    T val = THCNumerics<T>::lt(*in, maxValue) ? *in : maxValue;
-    *out = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+    T val = N_::lt(*in, maxValue) ? *in : maxValue;
+    *out = N_::gt(minValue, val) ? minValue : val;
   }
 
   __device__ __forceinline__ void operator()(T* v) {
-    T val = THCNumerics<T>::lt(*v, maxValue) ? *v : maxValue;
-    *v = THCNumerics<T>::gt(minValue, val) ? minValue : val;
+    T val = N_::lt(*v, maxValue) ? *v : maxValue;
+    *v = N_::gt(minValue, val) ? minValue : val;
   }
-
-  const T minValue;
-  const T maxValue;
+  const typename N_::storage_type minValue;
+  const typename N_::storage_type maxValue;
 };
 
 template <typename T>
 struct TensorLerpOp {
+  typedef THCNumerics<T> N_;
   TensorLerpOp(T w) : w(w) {}
-
   __device__ __forceinline__ void operator()(T *out, T *a, T *b) {
-    *out = THCNumerics<T>::add(
-      *a,
-      THCNumerics<T>::mul(
-          w,
-          THCNumerics<T>::sub(*b, *a)
-        )
-    );
+    *out = N_::add(*a, N_::mul(w, N_::sub(*b, *a)));
   }
-
-  const T w;
+  T w;
 };
 
 template <typename T>
 struct TensorCrossOp {
+  typedef THCNumerics<T> N_;
   TensorCrossOp(long sx, long sy, long so) : sx(sx), sy(sy), so(so) {}
-
   __device__ __forceinline__ void operator()(T* out, T* x, T*y) {
-    out[0 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[1 * sx], y[2 * sy]),
-        THCNumerics<T>::mul(x[2 * sx], y[1 * sy])
+    out[0 * so] = N_::sub(
+        N_::mul(x[1 * sx], y[2 * sy]),
+        N_::mul(x[2 * sx], y[1 * sy])
     );
 
-    out[1 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[2 * sx], y[0 * sy]),
-        THCNumerics<T>::mul(x[0 * sx], y[2 * sy])
+    out[1 * so] = N_::sub(
+        N_::mul(x[2 * sx], y[0 * sy]),
+        N_::mul(x[0 * sx], y[2 * sy])
     );
 
-    out[2 * so] = THCNumerics<T>::sub(
-        THCNumerics<T>::mul(x[0 * sx], y[1 * sy]),
-        THCNumerics<T>::mul(x[1 * sx], y[0 * sy])
+    out[2 * so] = N_::sub(
+        N_::mul(x[0 * sx], y[1 * sy]),
+        N_::mul(x[1 * sx], y[0 * sy])
     );
   }
 
@@ -430,36 +176,39 @@ struct TensorCrossOp {
 
 template <typename T>
 struct TensorMaxOp {
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::gt(*out, *in) ? *out : *in;
+    *out = N_::gt(*out, *in) ? *out : *in;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::gt(*in1, *in2) ? *in1 : *in2;
+    *out = N_::gt(*in1, *in2) ? *in1 : *in2;
   }
 };
 
 template <typename T>
 struct TensorMinOp {
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::lt(*out, *in) ? *out : *in;
+    *out = N_::lt(*out, *in) ? *out : *in;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::lt(*in1, *in2) ? *in1 : *in2;
+    *out = N_::lt(*in1, *in2) ? *in1 : *in2;
   }
 };
 
 template <typename T>
 struct TensorMaxValueOp {
+  typedef THCNumerics<T> N_;
   TensorMaxValueOp(T v) : val(v) {}
 
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::gt(*out, val) ? *out : val;
+    *out = N_::gt(*out, val) ? *out : val;
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::gt(*in, val) ? *in : val;
+    *out = N_::gt(*in, val) ? *in : val;
   }
 
   T val;
@@ -467,51 +216,39 @@ struct TensorMaxValueOp {
 
 template <typename T>
 struct TensorMinValueOp {
+  typedef THCNumerics<T> N_;
   TensorMinValueOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::lt(*out, val) ? *out : val;
+    *out = N_::lt(*out, val) ? *out : val;
   }
-
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::lt(*in, val) ? *in : val;
+    *out = N_::lt(*in, val) ? *in : val;
   }
-
   T val;
 };
 
 template <typename T>
 struct TensorAddCMulOp {
+  typedef THCNumerics<T> N_;
   TensorAddCMulOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::add(
-      *out,
-      THCNumerics<T>::mul(
-        val,
-        THCNumerics<T>::mul(*in1, *in2)
-      )
-    );
+    *out = N_::add(*out,N_::mul(val,N_::mul(*in1, *in2)));
   }
-
   T val;
 };
 
 template <typename T>
 struct TensorAddCDivOp {
+  typedef THCNumerics<T> N_;
   TensorAddCDivOp(T v) : val(v) {}
-
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = THCNumerics<T>::add(
-      *out,
-      THCNumerics<T>::mul(
-        val,
-        THCNumerics<T>::div(*in1, *in2)
-      )
-    );
+    *out = N_::add( *out,
+                    N_::mul(val,
+                            N_::div(*in1, *in2)
+                            )
+                    );
   }
-
-  T val;
+  typename N_::storage_type val;
 };
 
 #endif // THC_TENSORMATH_POINTWISE_CUH
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 4f5d5164..729e22a7 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -96,55 +96,4 @@ getTensorInfo(THCState* state, TensorType* t) {
     TensorUtils<TensorType>::getData(state, t), dims, sz, st);
 }
 
-template <typename T>
-struct ScalarNegate {
-  static __host__ __device__ T to(const T v) { return -v; }
-};
-
-template <typename T>
-struct ScalarInv {
-  static __host__ __device__ T to(const T v) { return ((T) 1) / v; }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct ScalarNegate<half> {
-  static __host__ __device__ half to(const half v) {
-#ifdef __CUDA_ARCH__
-#ifdef CUDA_HALF_INSTRUCTIONS
-    return __hneg(v);
-#else
-    return __float2half(-__half2float(v));
-#endif
-#else
-    half out = v;
-    out.x ^= 0x8000; // toggle sign bit
-    return out;
-#endif
-  }
-};
-
-template <>
-struct ScalarInv<half> {
-  static __host__ __device__ half to(const half v) {
-#ifdef __CUDA_ARCH__
-    return __float2half(1.0f / __half2float(v));
-#else
-    float fv = THC_half2float(v);
-    fv = 1.0f / fv;
-    return THC_float2half(fv);
-#endif
-  }
-};
-
-inline bool operator==(half a, half b) {
-  return a.x == b.x;
-}
-
-inline bool operator!=(half a, half b) {
-  return a.x != b.x;
-}
-
-#endif // CUDA_HALF_TENSOR
-
 #endif // THC_TENSOR_TYPE_UTILS_INC
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
index 26385042..287294db 100644
--- a/lib/THC/generic/THCTensorMathPointwise.cu
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -245,7 +245,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
       // self += -value * src2
       if (!THC_pointwiseApply2(state, self_, src2,
                                    TensorCAddOp<real>(
-                                     ScalarNegate<real>::to(value)))) {
+                                     THCNumerics<real>::neg(value)))) {
         THArgCheck(false, 2, CUTORCH_DIM_WARNING);
       }
     }
@@ -261,7 +261,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
       // self = src1 - value * src2
       if (!THC_pointwiseApply3(state, self_, src1, src2,
                                    TensorCAddOp<real>(
-                                     ScalarNegate<real>::to(value)))) {
+                                     THCNumerics<real>::neg(value)))) {
         THArgCheck(false, 2, CUTORCH_DIM_WARNING);
       }
     }

From 18383174c82fbfd08a9bd8b84639aeec2e4ee22a Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Fri, 28 Oct 2016 17:13:04 -0700
Subject: [PATCH 05/13] checkpoint

---
 CMakeLists.txt                    |  2 +-
 Tensor.lua                        |  9 +---
 generic/CStorage.c                | 24 +--------
 generic/CTensor.c                 | 11 ----
 lib/THC/CMakeLists.txt            |  2 +-
 lib/THC/THCGenerateHalfType.h     |  1 -
 lib/THC/THCHalf.cu                | 87 +++----------------------------
 lib/THC/THCHalf.h                 |  7 +--
 lib/THC/THCStorage.c              |  2 -
 lib/THC/THCStorage.h              |  2 +-
 lib/THC/THCTensorTypeUtils.cuh    |  2 +-
 lib/THC/generic/THCStorageCopy.c  | 75 +++++++++++++-------------
 lib/THC/generic/THCStorageCopy.cu | 24 +++++----
 lib/THC/generic/THCStorageCopy.h  | 10 ++--
 torch/generic/Tensor.c            |  2 +-
 15 files changed, 77 insertions(+), 183 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 96224c44..c3698e68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ IF (NOT WIN32)
 SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
 ENDIF (NOT WIN32)
 IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
-  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
+  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
 ENDIF()
 
 INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
diff --git a/Tensor.lua b/Tensor.lua
index 76c87295..0636d682 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -24,6 +24,7 @@ end
 
 local TensorTypes = {
    float  = 'torch.FloatTensor',
+   half   = 'torch.HalfTensor',
    double = 'torch.DoubleTensor',
    byte   = 'torch.ByteTensor',
    char   = 'torch.CharTensor',
@@ -85,11 +86,5 @@ for ValueType, CudaTensorType in pairs(CudaTensorTypes) do
 end
 
 if cutorch.hasHalf then
-   do
-      local function Tensor__totable(self)
-         local host_tensor = self:float()
-         return self:float():totable()
-      end
-      rawset(torch.getmetatable('torch.CudaHalfTensor'), 'totable', Tensor__totable)
-   end
+   CudaTensorTypes.half = 'torch.CudaHalfTensor'
 end
diff --git a/generic/CStorage.c b/generic/CStorage.c
index a6503f28..d6d4db57 100644
--- a/generic/CStorage.c
+++ b/generic/CStorage.c
@@ -6,7 +6,6 @@
 
 /* everything is as the generic Storage.c, except few things (see below) */
 
-#ifndef THC_REAL_IS_HALF
 #define THFile_readRealRaw(file, data, size)                            \
   {                                                                     \
     real *fdata = (real*)THAlloc(sizeof(real)*size);                    \
@@ -22,23 +21,6 @@
     TH_CONCAT_3(THFile_write,Real,Raw)(file, fdata, size);              \
     THFree(fdata);                                                      \
   }
-#else
-#define THFile_readRealRaw(file, data, size)                            \
-  {                                                                     \
-    real *fdata = (real*)THAlloc(sizeof(real)*size);                    \
-    THFile_readCharRaw(file, (char *)fdata, sizeof(real) * size);       \
-    THCudaCheck(cudaMemcpy(data, fdata, size * sizeof(real), cudaMemcpyHostToDevice)); \
-    THFree(fdata);                                                      \
-  }
-
-#define THFile_writeRealRaw(file, data, size)                           \
-  {                                                                     \
-    real *fdata = (real*)THAlloc(sizeof(real)*size);                    \
-    THCudaCheck(cudaMemcpy(fdata, data, size * sizeof(real), cudaMemcpyDeviceToHost)); \
-    THFile_writeCharRaw(file, (char *)fdata, size * sizeof(real));      \
-    THFree(fdata);                                                      \
-  }
-#endif
 
 #define TH_GENERIC_FILE "generic/Storage.c"
 #include "generic/Storage.c"
@@ -87,6 +69,8 @@ static int cutorch_Storage_(copy)(lua_State *L)
     THCStorage_(copyFloat)(state, storage, src);
   else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
     THCStorage_(copyDouble)(state, storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+    THCStorage_(copyHalf)(state, storage, src);
   else
     luaL_typerror(L, 2, "torch.*Storage");
 
@@ -94,7 +78,6 @@ static int cutorch_Storage_(copy)(lua_State *L)
   return 1;
 }
 
-#ifndef THC_REAL_IS_HALF
 static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
 {
   THStorage *storage = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Storage));
@@ -139,7 +122,6 @@ static int TH_CONCAT_3(cutorch_,Real,Storage_copy)(lua_State *L)
   lua_settop(L, 1);
   return 1;
 }
-#endif
 
 static int cutorch_Storage_(getDevice)(lua_State *L) {
   THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
@@ -154,12 +136,10 @@ void cutorch_Storage_(init)(lua_State* L)
 
   // torch_Storage macro is defined in Storage.c produce the CudaTensor types
   // so I have to construct the normal torch types by hand
-#ifndef THC_REAL_IS_HALF
   luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Storage));
   lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Storage_copy));
   lua_setfield(L, -2, "copy");
   lua_pop(L, 1);
-#endif
 
   luaT_pushmetatable(L, torch_Storage);
   lua_pushcfunction(L, cutorch_Storage_(copy));
diff --git a/generic/CTensor.c b/generic/CTensor.c
index a9663ff1..64ac76a9 100644
--- a/generic/CTensor.c
+++ b/generic/CTensor.c
@@ -56,7 +56,6 @@ static int cutorch_Tensor_(copy)(lua_State *L)
   return 1;
 }
 
-#ifndef THC_REAL_IS_HALF
 static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L)
 {
 #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
@@ -74,10 +73,7 @@ static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L)
   return 1;
 #undef STRINGIFY_TENSOR
 }
-#endif
-
 
-#ifndef THC_REAL_IS_HALF
 static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L)
 {
   THTensor *tensor = luaT_checkudata(L, 1, TH_CONCAT_STRING_3(torch.,Real,Tensor));
@@ -122,9 +118,7 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copy)(lua_State *L)
   lua_settop(L, 1);
   return 1;
 }
-#endif
 
-#ifndef THC_REAL_IS_HALF
 static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L)
 {
 #define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
@@ -139,9 +133,6 @@ static int TH_CONCAT_3(cutorch_,Real,Tensor_copyAsyncCuda)(lua_State *L)
   return 1;
 #undef STRINGIFY_TENSOR
 }
-#endif
-
-
 
 #ifdef THC_REAL_IS_FLOAT
 static void THFloatTensor_computesz(THFloatTensor *self, long **sz_, long **st_)
@@ -248,7 +239,6 @@ void cutorch_Tensor_(init)(lua_State* L)
   lua_pop(L, 1);
 #endif
 
-#ifndef THC_REAL_IS_HALF
   luaT_pushmetatable(L, TH_CONCAT_STRING_3(torch.,Real,Tensor));
   lua_pushcfunction(L, TH_CONCAT_3(cutorch_,Real,Tensor_copy));
   lua_setfield(L, -2, "copy");
@@ -264,7 +254,6 @@ void cutorch_Tensor_(init)(lua_State* L)
   lua_pushcfunction(L, cutorch_Tensor_(copyAsyncCPU));
   lua_setfield(L, -2, "copyAsync");
   lua_pop(L, 1);
-#endif
 
   luaT_pushmetatable(L, torch_Tensor);
   lua_pushcfunction(L, cutorch_Tensor_(copy));
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index b0813450..c82e4326 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -174,7 +174,7 @@ IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
   MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
   LIST(APPEND src-cuda THCHalf.cu)
   LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
-  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
+  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
 ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
   MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
 ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h
index ac592162..501239d7 100644
--- a/lib/THC/THCGenerateHalfType.h
+++ b/lib/THC/THCGenerateHalfType.h
@@ -5,7 +5,6 @@
 #include "THCHalf.h"
 
 #ifdef CUDA_HALF_TENSOR
-
 #define real half
 #define accreal float
 #define Real Half
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 96397ffb..c574bf8d 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -32,92 +32,17 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
 
 float THC_half2float(half a)
 {
-  unsigned int bits = a.x & 0x7fff;
-  unsigned int sign = a.x & 0x8000;
-  unsigned int exp = a.x & 0x7c00;
-
-  bits <<= 13;
-  sign <<= 16;
-
-  bits += 0x38000000U;
-
-  // flush denormals to 0
-  bits = (exp == 0 ? 0 : bits) | sign;
-
-  union {
-    float f;
-    unsigned int v;
-  } conv;
-  conv.v = bits;
-
-  return conv.f;
+  TH_half h;
+  h.x = a.x;
+  return TH_half2float(h);
 }
 
-/*
-  Copyright (c) 2015, Norbert Juffa
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-  1. Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-
-  2. Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
 half THC_float2half(float a)
 {
-  uint32_t ia;
-  uint16_t ir;
-  memcpy(&ia, &a, sizeof(float));
-
-  ir = (ia >> 16) & 0x8000;
-  if ((ia & 0x7f800000) == 0x7f800000) {
-    if ((ia & 0x7fffffff) == 0x7f800000) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ir = 0x7fff; /* canonical NaN */
-    }
-  } else if ((ia & 0x7f800000) >= 0x33000000) {
-    int shift = (int)((ia >> 23) & 0xff) - 127;
-    if (shift > 15) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
-      if (shift < -14) { /* denormal */
-        ir |= ia >> (-1 - shift);
-        ia = ia << (32 - (-1 - shift));
-      } else { /* normal */
-        ir |= ia >> (24 - 11);
-        ia = ia << (32 - (24 - 11));
-        ir = ir + ((14 + shift) << 10);
-      }
-      /* IEEE-754 round to nearest of even */
-      if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
-        ir++;
-      }
-    }
-  }
-
   half ret;
-  memcpy(&ret, &ir, sizeof(half));
-  return ret;
+  TH_half th_res = TH_float2half(a);
+  ret.x = th_res.x ;
+  return ret ;
 }
 
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 7c055e7a..dc77584b 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -8,9 +8,8 @@
 #define CUDA_HALF_TENSOR 1
 #endif
 
-#ifdef CUDA_HALF_TENSOR
+#include "THHalf.h"
 
-#include <cuda_fp16.h>
 #include <stdint.h>
 
 THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
@@ -24,6 +23,8 @@ THC_API int THC_nativeHalfInstructions(THCState *state);
 /* Check for performant native fp16 support on the current device */
 THC_API int THC_fastHalfInstructions(THCState *state);
 
-#endif /* CUDA_HALF_TENSOR */
+#  undef  TH_GENERIC_USE_HALF
+#  define TH_GENERIC_USE_HALF 1
+
 
 #endif
diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c
index 669efa82..6fc9574e 100644
--- a/lib/THC/THCStorage.c
+++ b/lib/THC/THCStorage.c
@@ -2,7 +2,5 @@
 #include "THCGeneral.h"
 #include "THAtomic.h"
 
-#include "THCHalf.h"
-
 #include "generic/THCStorage.c"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h
index ac1cd709..025a3187 100644
--- a/lib/THC/THCStorage.h
+++ b/lib/THC/THCStorage.h
@@ -1,8 +1,8 @@
 #ifndef THC_STORAGE_INC
 #define THC_STORAGE_INC
 
+#include "THCHalf.h"
 #include "THStorage.h"
-#include "THCGeneral.h"
 
 #define THCStorage        TH_CONCAT_3(TH,CReal,Storage)
 #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME)
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 81051f75..25472960 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -71,7 +71,7 @@ TENSOR_UTILS(THCudaCharTensor, char, long);
 TENSOR_UTILS(THCudaShortTensor, short, long);
 TENSOR_UTILS(THCudaIntTensor, int, long);
 TENSOR_UTILS(THCudaLongTensor, long, long);
-TENSOR_UTILS(THCudaTensor, float, float);
+TENSOR_UTILS(THCudaTensor, float, double);
 TENSOR_UTILS(THCudaDoubleTensor, double, double);
 
 #ifdef CUDA_HALF_TENSOR
diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c
index af5dbcc2..bf7a628f 100644
--- a/lib/THC/generic/THCStorageCopy.c
+++ b/lib/THC/generic/THCStorageCopy.c
@@ -2,40 +2,36 @@
 #define THC_GENERIC_FILE "generic/THCStorageCopy.c"
 #else
 
-#ifndef THC_REAL_IS_HALF
 void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src)
 {
   THArgCheck(self->size == src->size, 2, "size does not match");
   THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyHostToDevice));
 }
-#endif
 
 #ifndef THC_REAL_IS_HALF
-#define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC)                            \
-  void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src)  \
-  {                                                                      \
-    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) {  \
-      THCStorage_(copyCPU)(state, self, (THStorage*) src);   /* cast just removes compiler warning */ \
-    } else {                                                             \
-      THStorage *buffer;                                                 \
-      THArgCheck(self->size == src->size, 2, "size does not match");     \
-      buffer = THStorage_(newWithSize)(src->size);                       \
+#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC)                        \
+      THStorage *buffer= THStorage_(newWithSize)(src->size);             \
       THStorage_(copy##TYPEC)(buffer, src);                              \
       THCStorage_(copyCPU)(state, self, buffer);                         \
-      THStorage_(free)(buffer);                                          \
-    }                                                                    \
-  }
+      THStorage_(free)(buffer);
 #else
+#define TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC)                        \
+    THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \
+    THCudaStorage_copy##TYPEC(state, buffer, src);                       \
+    THCFloat2Half(state, self->data, buffer->data, src->size);           \
+    THCudaStorage_free(state, buffer);
+#endif
+
 #define TH_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC)                            \
   void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src)  \
   {                                                                      \
+    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) {                        \
+      THCStorage_(copyCPU)(state, self, (THStorage*) src);   /* cast just removes compiler warning */ \
+    } else {                                                             \
     THArgCheck(self->size == src->size, 2, "size does not match");       \
-    THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \
-    THCudaStorage_copy##TYPEC(state, buffer, src);                       \
-    THCFloat2Half(state, self->data, buffer->data, src->size);           \
-    THCudaStorage_free(state, buffer);                                   \
+    TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX(TYPEC)                            \
+    }                                                                    \
   }
-#endif
 
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Byte)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Char)
@@ -43,52 +39,57 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Short)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Int)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Long)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Float)
+#ifdef CUDA_HALF_TENSOR
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Half)
+#endif
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Double)
 
-#ifndef THC_REAL_IS_HALF
 void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src)
 {
   THArgCheck(self->size == src->size, 2, "size does not match");
   THCudaCheck(cudaMemcpy(self->data, src->data, self->size * sizeof(real), cudaMemcpyDeviceToHost));
 }
-#endif
 
 #ifndef THC_REAL_IS_HALF
-#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC)                         \
-  void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \
-  {                                                                     \
-    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \
-      THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */                    \
-    } else {                                                            \
+#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC)                     \
       THStorage *buffer;                                                \
-      THArgCheck(self->size == src->size, 2, "size does not match");    \
       buffer = THStorage_(newWithSize)(src->size);                      \
       THStorage_(copyCuda)(state, buffer, src);                         \
       TH_CONCAT_4(TH,TYPEC,Storage_copy,Real)(self, buffer);            \
-      THStorage_(free)(buffer);                                         \
-    }                                                                   \
-  }
+      THStorage_(free)(buffer);
 #else
-#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC)                         \
-  void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \
-  {                                                                     \
-    THArgCheck(self->size == src->size, 2, "size does not match");      \
+#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC)                     \
     THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size);\
     THCHalf2Float(state, buffer->data, src->data, src->size);           \
     TH_CONCAT_3(TH,TYPEC,Storage_copyCudaFloat)(state, self, buffer);   \
-    THCudaStorage_free(state, buffer);                                  \
-  }
+    THCudaStorage_free(state, buffer);
 #endif
 
+#define TH_CUDA_STORAGE_IMPLEMENT_COPYTO(TYPEC)                         \
+  void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \
+  {                                                                     \
+    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \
+      THStorage_(copyCuda)(state, (THStorage*) self, src); /* cast just removes compiler warnings */                    \
+    } else {                                                            \
+      THArgCheck(self->size == src->size, 2, "size does not match");    \
+      TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX(TYPEC)                       \
+    }                                                                   \
+  }
+
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Int)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Long)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Float)
+#ifdef CUDA_HALF_TENSOR
+TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Half)
+#endif
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double)
 
 #undef TH_CUDA_STORAGE_IMPLEMENT_COPY
 #undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO
+#undef TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX
+#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX
 
 #endif
diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu
index 298f7179..1b224c43 100644
--- a/lib/THC/generic/THCStorageCopy.cu
+++ b/lib/THC/generic/THCStorageCopy.cu
@@ -43,14 +43,18 @@ void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src)
   void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src)  \
   {                                                                      \
     THArgCheck(self->size == src->size, 2, "size does not match");       \
-    if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) {                          \
-      THCFloat2Half(state, self->data, (float*) src->data, src->size);   /* cast removes compiler error */     \
-    } else {                                                             \
-      THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \
-      THCudaStorage_copyCuda##TYPEC(state, buffer, src);                 \
-      THCFloat2Half(state, self->data, buffer->data, buffer->size);      \
-      THCudaStorage_free(state, buffer);                                 \
-    }                                                                    \
+    if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) {                        \
+      THCStorage_(copy)(state, self, (THCStorage*) src);   /* cast just removes compiler warning */ \
+    } else {                                                            \
+      if(THCTypeIdx_(TYPEC) == THCTypeIdxFloat) {                       \
+        THCFloat2Half(state, self->data, (float*) src->data, src->size);   /* cast removes compiler error */ \
+      } else {                                                          \
+        THCudaStorage *buffer = THCudaStorage_newWithSize(state, src->size); \
+        THCudaStorage_copyCuda##TYPEC(state, buffer, src);              \
+          THCFloat2Half(state, self->data, buffer->data, buffer->size); \
+          THCudaStorage_free(state, buffer);                            \
+      }                                                                 \
+    }                                                                   \
   }
 #endif
 
@@ -62,7 +66,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long)
 THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,)  // i.e. float
 THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double)
 
-#ifdef CUDA_HALF_TENSOR
+#if defined (CUDA_HALF_TENSOR)
 #define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat)
 void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src)
 {
@@ -76,7 +80,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH
         THCudaStorage_free(state, buffer);
     }
 }
-#undef FLOAT_COPY
+# undef FLOAT_COPY
 #endif // CUDA_HALF_TENSOR
 
 #undef THC_CUDA_STORAGE_IMPLEMENT_COPY
diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h
index c3e56013..7f1ccb08 100644
--- a/lib/THC/generic/THCStorageCopy.h
+++ b/lib/THC/generic/THCStorageCopy.h
@@ -13,6 +13,7 @@ THC_API void THCStorage_(copyInt)(THCState *state, THCStorage *storage, struct T
 THC_API void THCStorage_(copyLong)(THCState *state, THCStorage *storage, struct THLongStorage *src);
 THC_API void THCStorage_(copyFloat)(THCState *state, THCStorage *storage, struct THFloatStorage *src);
 THC_API void THCStorage_(copyDouble)(THCState *state, THCStorage *storage, struct THDoubleStorage *src);
+THC_API void THCStorage_(copyHalf)(THCState *state, THCStorage *storage, struct THHalfStorage *src);
 
 THC_API void THCStorage_(copyCudaByte)(THCState *state, THCStorage *storage, struct THCudaByteStorage *src);
 THC_API void THCStorage_(copyCudaChar)(THCState *state, THCStorage *storage, struct THCudaCharStorage *src);
@@ -21,7 +22,8 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru
 THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
 THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
 THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
-#ifdef CUDA_HALF_TENSOR
+
+#if 0 /* def CUDA_HALF_TENSOR */
 THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
 #endif
 
@@ -32,12 +34,12 @@ THC_API void TH_CONCAT_2(THIntStorage_copyCuda   , Real)(THCState *state, THIntS
 THC_API void TH_CONCAT_2(THLongStorage_copyCuda  , Real)(THCState *state, THLongStorage *self, struct THCStorage *src);
 THC_API void TH_CONCAT_2(THFloatStorage_copyCuda , Real)(THCState *state, THFloatStorage *self, struct THCStorage *src);
 THC_API void TH_CONCAT_2(THDoubleStorage_copyCuda, Real)(THCState *state, THDoubleStorage *self, struct THCStorage *src);
+#ifdef CUDA_HALF_TENSOR
+THC_API void TH_CONCAT_2(THHalfStorage_copyCuda, Real)(THCState *state, THHalfStorage *self, struct THCStorage *src);
+#endif
 
-/* There is no THHalfStorage */
-#ifndef THC_REAL_IS_HALF
 THC_API void THStorage_(copyCuda)(THCState *state, THStorage *self, THCStorage *src);
 THC_API void THCStorage_(copyCuda)(THCState *state, THCStorage *self, THCStorage *src);
 THC_API void THCStorage_(copyCPU)(THCState *state, THCStorage *self, THStorage *src);
-#endif
 
 #endif
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
index d7dcd53b..53b082c0 100644
--- a/torch/generic/Tensor.c
+++ b/torch/generic/Tensor.c
@@ -143,7 +143,7 @@ static int torch_Tensor_(new)(lua_State *L)
           luaL_error(L, "invalid element (not a number)");
         }
 
-#ifdef THC_REAL_IS_HALF
+#ifndef THC_HALF
         half value = THC_float2half((float) lua_tonumber(L, -1));
 #else
         real value = (real) lua_tonumber(L, -1);

From 10ef05657e6f800127797e620c1ad2ced596d33e Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Sat, 29 Oct 2016 15:54:48 -0700
Subject: [PATCH 06/13] Using half from TH

---
 generic/CStorage.c                 |  2 --
 generic/CTensor.c                  |  2 --
 init.c                             |  5 ++---
 lib/THC/THCAtomics.cuh             |  2 +-
 lib/THC/THCBlas.cu                 |  2 --
 lib/THC/THCBlas.h                  |  1 -
 lib/THC/THCGeneral.h.in            | 10 ++++------
 lib/THC/THCGenerateHalfType.h      |  2 +-
 lib/THC/THCHalf.cu                 | 17 +----------------
 lib/THC/THCHalf.h                  | 11 +++++++----
 lib/THC/THCNumerics.cuh            |  3 ++-
 lib/THC/THCStorage.cu              |  2 --
 lib/THC/THCStorage.h               |  3 ++-
 lib/THC/THCStorageCopy.c           |  3 ---
 lib/THC/THCStorageCopy.cu          |  3 +--
 lib/THC/THCStorageCopy.h           |  1 -
 lib/THC/THCTensor.h                |  3 ++-
 lib/THC/THCTensorCopy.c            |  4 +---
 lib/THC/THCTensorCopy.cu           |  1 -
 lib/THC/THCTensorCopy.h            |  1 -
 lib/THC/THCTensorIndex.cu          |  6 +++---
 lib/THC/THCTensorMath.h            |  1 -
 lib/THC/THCTensorMathPairwise.cu   |  3 +--
 lib/THC/THCTensorMathPointwise.cuh |  3 +--
 lib/THC/THCTensorTypeUtils.cu      |  1 -
 lib/THC/THCTensorTypeUtils.cuh     |  1 -
 lib/THC/generic/THCStorageCopy.h   |  2 +-
 lib/THC/generic/THCTensorCopy.h    |  9 ++++++---
 test/test.lua                      |  4 ++--
 torch/generic/Tensor.c             |  4 +---
 torch/utils.h                      |  2 ++
 31 files changed, 41 insertions(+), 73 deletions(-)

diff --git a/generic/CStorage.c b/generic/CStorage.c
index d6d4db57..9be795fd 100644
--- a/generic/CStorage.c
+++ b/generic/CStorage.c
@@ -2,8 +2,6 @@
 #define THC_GENERIC_FILE "generic/CStorage.c"
 #else
 
-#include "THCHalf.h"
-
 /* everything is as the generic Storage.c, except few things (see below) */
 
 #define THFile_readRealRaw(file, data, size)                            \
diff --git a/generic/CTensor.c b/generic/CTensor.c
index 64ac76a9..e22ba59d 100644
--- a/generic/CTensor.c
+++ b/generic/CTensor.c
@@ -2,8 +2,6 @@
 #define THC_GENERIC_FILE "generic/CTensor.c"
 #else
 
-#include "THCHalf.h"
-
 /* everything is as the generic Storage.c, except few things (see below) */
 
 #define TH_GENERIC_FILE "generic/Tensor.c"
diff --git a/init.c b/init.c
index d424aa6e..07ad51be 100644
--- a/init.c
+++ b/init.c
@@ -3,7 +3,6 @@
 #include "THCGeneral.h"
 #include "THCCachingAllocator.h"
 #include "THCTensorRandom.h"
-#include "THCHalf.h" // for CUDA_HALF_TENSOR
 
 extern void cutorch_CudaByteStorage_init(lua_State* L);
 extern void cutorch_CudaCharStorage_init(lua_State* L);
@@ -911,11 +910,11 @@ static int cutorch_hasHalfInstructions(lua_State *L) {
 
 static int cutorch_hasFastHalfInstructions(lua_State *L) {
   THCState *state = cutorch_getstate(L);
-#ifdef CUDA_HALF_TENSOR  
+#ifdef CUDA_HALF_TENSOR
   lua_pushboolean(L, THC_fastHalfInstructions(state));
 #else
   lua_pushboolean(L, 0);
-#endif  
+#endif
   return 1;
 }
 
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
index 42291144..31274e2c 100644
--- a/lib/THC/THCAtomics.cuh
+++ b/lib/THC/THCAtomics.cuh
@@ -1,7 +1,7 @@
 #ifndef THC_ATOMICS_INC
 #define THC_ATOMICS_INC
 
-#include "THCHalf.h"
+#include "THCGeneral.h"
 
 template <typename T, size_t n>
 struct AtomicAddIntegerImpl;
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
index e3462025..26ab4398 100644
--- a/lib/THC/THCBlas.cu
+++ b/lib/THC/THCBlas.cu
@@ -1,6 +1,4 @@
 #include "THCBlas.h"
-#include "THCGeneral.h"
-#include "THCHalf.h"
 
 float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy)
 {
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
index 45f58eba..5a0c58d6 100644
--- a/lib/THC/THCBlas.h
+++ b/lib/THC/THCBlas.h
@@ -2,7 +2,6 @@
 #define THC_BLAS_INC
 
 #include "THCGeneral.h"
-#include "THCHalf.h"
 
 /* Level 1 */
 THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy);
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index c50cc1c7..a08bf5a1 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -6,12 +6,6 @@
 #include "THCThreadLocal.h"
 #undef log1p
 
-#include "cuda.h"
-#include "cuda_runtime.h"
-#include "cublas_v2.h"
-
-#cmakedefine USE_MAGMA
-
 #ifdef __cplusplus
 # define THC_EXTERNC extern "C"
 #else
@@ -44,6 +38,10 @@ struct THCRNGState;  /* Random number generator state. */
 typedef struct THCStream THCStream;
 typedef struct THCState THCState;
 
+#include "THCHalf.h"
+
+#cmakedefine USE_MAGMA
+
 typedef struct _THCDeviceAllocator {
    cudaError_t (*malloc)( void*, void**, size_t,         cudaStream_t);
    cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h
index 501239d7..6f46e7f0 100644
--- a/lib/THC/THCGenerateHalfType.h
+++ b/lib/THC/THCGenerateHalfType.h
@@ -2,7 +2,7 @@
 #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h"
 #endif
 
-#include "THCHalf.h"
+#include "THCGeneral.h"
 
 #ifdef CUDA_HALF_TENSOR
 #define real half
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index c574bf8d..bb19d498 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -1,4 +1,4 @@
-#include "THCHalf.h"
+#include "THCGeneral.h"
 #include <thrust/transform.h>
 #include <thrust/execution_policy.h>
 
@@ -30,21 +30,6 @@ void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
     in, in + len, out, __half2floatOp());
 }
 
-float THC_half2float(half a)
-{
-  TH_half h;
-  h.x = a.x;
-  return TH_half2float(h);
-}
-
-half THC_float2half(float a)
-{
-  half ret;
-  TH_half th_res = TH_float2half(a);
-  ret.x = th_res.x ;
-  return ret ;
-}
-
 THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
   cudaDeviceProp* prop =
     THCState_getCurrentDeviceProperties(state);
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index dc77584b..01b0a692 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -1,11 +1,14 @@
 #ifndef THC_HALF_CONVERSION_INC
 #define THC_HALF_CONVERSION_INC
 
-#include "THCGeneral.h"
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+#include "cuda_fp16.h"
 
 /* We compile with CudaHalfTensor support if we have this: */
 #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
-#define CUDA_HALF_TENSOR 1
+#  define CUDA_HALF_TENSOR 1
 #endif
 
 #include "THHalf.h"
@@ -14,8 +17,8 @@
 
 THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
 THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
-THC_API half THC_float2half(float a);
-THC_API float THC_half2float(half a);
+# define THC_float2half(a) TH_float2half(a)
+# define THC_half2float(a) TH_half2float(a)
 
 /* Check for native fp16 support on the current device (CC 5.3+) */
 THC_API int THC_nativeHalfInstructions(THCState *state);
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 09443605..4765048a 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -3,7 +3,8 @@
 
 #include <cuda.h>
 #include <limits.h>
-#include "THCHalf.h"
+
+#include "THCGeneral.h"
 
 /// Class for numeric limits of the particular data type, which
 /// includes support for `half`.
diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu
index a23794c2..2ceb0c7f 100644
--- a/lib/THC/THCStorage.cu
+++ b/lib/THC/THCStorage.cu
@@ -6,7 +6,5 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
-#include "THCHalf.h"
-
 #include "generic/THCStorage.cu"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h
index 025a3187..6a27fa34 100644
--- a/lib/THC/THCStorage.h
+++ b/lib/THC/THCStorage.h
@@ -1,7 +1,8 @@
 #ifndef THC_STORAGE_INC
 #define THC_STORAGE_INC
 
-#include "THCHalf.h"
+#include "THCGeneral.h"
+
 #include "THStorage.h"
 
 #define THCStorage        TH_CONCAT_3(TH,CReal,Storage)
diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c
index cf2bf8a6..721da7fe 100644
--- a/lib/THC/THCStorageCopy.c
+++ b/lib/THC/THCStorageCopy.c
@@ -1,7 +1,4 @@
 #include "THCStorageCopy.h"
-#include "THCGeneral.h"
-
-#include "THCHalf.h"
 
 #include "generic/THCStorageCopy.c"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu
index 18496863..b245d90e 100644
--- a/lib/THC/THCStorageCopy.cu
+++ b/lib/THC/THCStorageCopy.cu
@@ -1,7 +1,6 @@
-#include "THCStorageCopy.h"
 #include "THCGeneral.h"
+#include "THCStorageCopy.h"
 
-#include "THCHalf.h"
 
 #include "generic/THCStorageCopy.cu"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h
index 837056fc..ec8011d3 100644
--- a/lib/THC/THCStorageCopy.h
+++ b/lib/THC/THCStorageCopy.h
@@ -3,7 +3,6 @@
 
 #include "THCStorage.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
 
 #include "generic/THCStorageCopy.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h
index d4eb49a3..8a3ab9ff 100644
--- a/lib/THC/THCTensor.h
+++ b/lib/THC/THCTensor.h
@@ -1,9 +1,10 @@
 #ifndef THC_TENSOR_INC
 #define THC_TENSOR_INC
 
+#include "THCGeneral.h"
 #include "THTensor.h"
 #include "THCStorage.h"
-#include "THCGeneral.h"
+
 
 #define THCTensor          TH_CONCAT_3(TH,CReal,Tensor)
 #define THCTensor_(NAME)   TH_CONCAT_4(TH,CReal,Tensor_,NAME)
diff --git a/lib/THC/THCTensorCopy.c b/lib/THC/THCTensorCopy.c
index 1bf8980d..9030ab6a 100644
--- a/lib/THC/THCTensorCopy.c
+++ b/lib/THC/THCTensorCopy.c
@@ -1,8 +1,6 @@
-#include "THCTensorCopy.h"
 #include "THCGeneral.h"
 #include "THCTensor.h"
-
-#include "THCHalf.h"
+#include "THCTensorCopy.h"
 
 #include "generic/THCTensorCopy.c"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
index 8889939f..bb6425c5 100644
--- a/lib/THC/THCTensorCopy.cu
+++ b/lib/THC/THCTensorCopy.cu
@@ -1,5 +1,4 @@
 #include "THCApply.cuh"
-#include "THCHalf.h"
 #include "THCNumerics.cuh"
 
 inline int curGPU() {
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
index e8bc4f4b..fc206cb7 100644
--- a/lib/THC/THCTensorCopy.h
+++ b/lib/THC/THCTensorCopy.h
@@ -3,7 +3,6 @@
 
 #include "THCTensor.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
 
 #include "generic/THCTensorCopy.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu
index 415e6256..1fe3683f 100644
--- a/lib/THC/THCTensorIndex.cu
+++ b/lib/THC/THCTensorIndex.cu
@@ -1,10 +1,10 @@
-#include "THC.h"
-#include "THCTensorMath.h"
 #include "THCGeneral.h"
+
+#include "THCTensorMath.h"
 #include "THCBlas.h"
 #include "THCTensorCopy.h"
 #include "THCTensorRandom.h"
-#include "THCHalf.h"
+
 #include "THCApply.cuh"
 #include "THCReduce.cuh"
 #include "THCDeviceUtils.cuh"
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 3d714692..fd4d0ae0 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -2,7 +2,6 @@
 #define TH_CUDA_TENSOR_MATH_INC
 
 #include "THCTensor.h"
-#include "THCGeneral.h"
 
 #include "generic/THCTensorMath.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 2695f2df..c482ae78 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -1,6 +1,5 @@
-#include "THCTensorMath.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
+#include "THCTensorMath.h"
 #include "THCTensorCopy.h"
 #include "THCApply.cuh"
 #include "THCNumerics.cuh"
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index c52e0827..cd9055a7 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -1,9 +1,8 @@
 #ifndef THC_TENSORMATH_POINTWISE_CUH
 #define THC_TENSORMATH_POINTWISE_CUH
 
-#include "THCTensorMath.h"
 #include "THCGeneral.h"
-#include "THCHalf.h"
+#include "THCTensorMath.h"
 #include "THCTensorCopy.h"
 #include "THCApply.cuh"
 #include "THCNumerics.cuh"
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index a273a728..d3019a07 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -1,7 +1,6 @@
 #include "THCTensorTypeUtils.cuh"
 #include "THCTensor.h"
 #include "THCTensorCopy.h"
-#include "THCHalf.h"
 #include <stdlib.h>
 
 namespace {
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 25472960..ce8f21ac 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -4,7 +4,6 @@
 #include <cuda.h>
 #include <assert.h>
 #include "THCGeneral.h"
-#include "THCHalf.h"
 #include "THCTensor.h"
 #include "THCTensorInfo.cuh"
 
diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h
index 7f1ccb08..c930a0f5 100644
--- a/lib/THC/generic/THCStorageCopy.h
+++ b/lib/THC/generic/THCStorageCopy.h
@@ -23,7 +23,7 @@ THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, str
 THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
 THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
 
-#if 0 /* def CUDA_HALF_TENSOR */
+#ifdef CUDA_HALF_TENSOR
 THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
 #endif
 
diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h
index 71d878d7..0bec3790 100644
--- a/lib/THC/generic/THCTensorCopy.h
+++ b/lib/THC/generic/THCTensorCopy.h
@@ -11,6 +11,9 @@ THC_API void THCTensor_(copyInt)(THCState *state, THCTensor *self, THIntTensor *
 THC_API void THCTensor_(copyLong)(THCState *state, THCTensor *self, THLongTensor *src);
 THC_API void THCTensor_(copyFloat)(THCState *state, THCTensor *self, THFloatTensor *src);
 THC_API void THCTensor_(copyDouble)(THCState *state, THCTensor *self, THDoubleTensor *src);
+#ifdef CUDA_HALF_TENSOR
+THC_API void THCTensor_(copyHalf)(THCState *state, THCTensor *self, struct THHalfTensor *src);
+#endif
 
 THC_API void THCTensor_(copyCudaByte)(THCState *state, THCTensor *dst, struct THCudaByteTensor *src);
 THC_API void THCTensor_(copyCudaChar)(THCState *state, THCTensor *dst, struct THCudaCharTensor *src);
@@ -30,15 +33,15 @@ THC_API void TH_CONCAT_2(THIntTensor_copyCuda   , Real)  (THCState *state, THInt
 THC_API void TH_CONCAT_2(THLongTensor_copyCuda  , Real)  (THCState *state, THLongTensor *self, THCTensor *src);
 THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real)  (THCState *state, THFloatTensor *self, THCTensor *src);
 THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real)  (THCState *state, THDoubleTensor *self, THCTensor *src);
+#ifdef CUDA_HALF_TENSOR
+THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real)  (THCState *state, THDoubleTensor *self, THCTensor *src);
+#endif
 THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src);
 
-/* There is no THHalfTensor */
-#ifndef THC_REAL_IS_HALF
 THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src);
 THC_API void THCTensor_(copyCPU) (THCState *state, THCTensor *self, THTensor *src);
 
 THC_API void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, THTensor *src);
 THC_API void THTensor_(copyAsyncCuda)(THCState *state, THTensor *self, THCTensor *src);
-#endif
 
 #endif
diff --git a/test/test.lua b/test/test.lua
index 058103d9..6525de95 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -2483,7 +2483,7 @@ function test.logNormal()
    local sz1 = chooseInt(minsize, maxsize)
    local sz2 = chooseInt(minsize, maxsize)
    local mean, std = torch.uniform(), 0.1 * torch.uniform()
-   local tolerance = 0.01
+   local tolerance = 0.02
    local t = torch.CudaTensor(sz1, sz2)
 
    t:logNormal(mean, std)
@@ -3261,7 +3261,7 @@ function test.cat()
 end
 
 function test.catArray()
-   for k, typename in ipairs(typenames) do   
+   for k, typename in ipairs(typenames) do
       for dim = 1, 3 do
 	 local x = torch.Tensor(13, minsize, minsize):uniform()
 	    :type(typename):transpose(1, dim)
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
index 53b082c0..5e02ab1b 100644
--- a/torch/generic/Tensor.c
+++ b/torch/generic/Tensor.c
@@ -2,8 +2,6 @@
 #define TH_GENERIC_FILE "generic/Tensor.c"
 #else
 
-#include "THCHalf.h"
-
 static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
                                                          THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_);
 
@@ -143,7 +141,7 @@ static int torch_Tensor_(new)(lua_State *L)
           luaL_error(L, "invalid element (not a number)");
         }
 
-#ifndef THC_HALF
+#ifdef THC_REAL_IS_HALF
         half value = THC_float2half((float) lua_tonumber(L, -1));
 #else
         real value = (real) lua_tonumber(L, -1);
diff --git a/torch/utils.h b/torch/utils.h
index ae959b73..74d68c66 100644
--- a/torch/utils.h
+++ b/torch/utils.h
@@ -1,6 +1,8 @@
 #ifndef CUTORCH_UTILS_INC
 #define CUTORCH_UTILS_INC
 
+#include "THCGeneral.h"
+
 #include "luaT.h"
 #include "TH.h"
 

From 838ec700c17cb9f2b942c8110184fa1eef4c1301 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 15 Nov 2016 02:29:53 -0800
Subject: [PATCH 07/13] Implemented cudaMemGetInfo for caching allocator

---
 init.c                          | 31 ++++++++++++++++++---------
 lib/THC/THCCachingAllocator.cpp | 37 +++++++++++++++++++++++++++++----
 lib/THC/THCGeneral.c            | 28 +++++++++++++++++++++++++
 lib/THC/THCGeneral.h.in         |  2 ++
 4 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/init.c b/init.c
index 69f5583a..f7c97d36 100644
--- a/init.c
+++ b/init.c
@@ -694,19 +694,31 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
 }
 
 static int cutorch_getMemoryUsage(lua_State *L) {
-  size_t freeBytes = 0;
   size_t totalBytes = 0;
-  int curDevice;
-  THCudaCheck(cudaGetDevice(&curDevice));
+  size_t freeBytes = 0;
+
+  THCState *state = cutorch_getstate(L);
 
   int device = luaL_optint(L, 1, -10);
-  if (device == -10) { /* no argument passed, current device mem usage */
-    THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes));
-  } else { /* argument was given, particular device's memory usage */
-    THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
-    THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes));
-    THCudaCheck(cudaSetDevice(curDevice));
+  if (device != -10) { /* no argument passed, current device mem usage */
+    --device;
   }
+
+  int prevDevice, curDevice = -10;
+  THCudaCheck(cudaGetDevice(&prevDevice));
+
+  if (device != -10) { /* no argument passed, current device mem usage */
+    curDevice = device; /* zero indexed */
+    if (curDevice != prevDevice)
+      THCudaCheck(cudaSetDevice(curDevice)); 
+  } 
+
+  THCudaCheck(THCudaMemGetInfo(state, &totalBytes, &freeBytes));
+
+  if (curDevice != prevDevice) { /* restore current device if we have changed it */
+    THCudaCheck(cudaSetDevice(prevDevice));
+  }
+
   lua_pushnumber(L, freeBytes);
   lua_pushnumber(L, totalBytes);
   return 2;
@@ -714,7 +726,6 @@ static int cutorch_getMemoryUsage(lua_State *L) {
 
 static int cutorch_setDevice(lua_State *L)
 {
-  THCState *state = cutorch_getstate(L);
   int device = (int)luaL_checknumber(L, 1)-1;
   THCudaCheck(cudaSetDevice(device));
   return 0;
diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
index e2fc8d85..ab9528e1 100644
--- a/lib/THC/THCCachingAllocator.cpp
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -158,12 +158,12 @@ struct THCCachingAllocator
     allocated_blocks.erase(it);
 
     bool small = block->size <= kSmallAlloc;
-    auto& free_blocks = small ? large_blocks : small_blocks;
-    try_merge_blocks(block, block->prev, free_blocks);
-    try_merge_blocks(block, block->next, free_blocks);
+    auto& cur_free_blocks = small ? large_blocks : small_blocks;
+    try_merge_blocks(block, block->prev, cur_free_blocks);
+    try_merge_blocks(block, block->next, cur_free_blocks);
 
     block->allocated = false;
-    free_blocks.insert(block);
+    cur_free_blocks.insert(block);
 
     return cudaSuccess;
   }
@@ -205,6 +205,27 @@ struct THCCachingAllocator
     return basePtr;
   }
 
+  // Accumulates sizes of all memory blocks for given device in given free list
+  void cacheInfoAux(FreeBlocks& blocks, int dev_id, size_t* total, size_t* largest)
+  {
+    Block search_key(dev_id, 0, 0);
+    auto it = blocks.lower_bound(&search_key);
+    for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) {
+      size_t blocksize = (*it)->size;
+      total += blocksize;
+      if (blocksize > *largest)
+	*largest = blocksize;
+    }
+  }
+  
+  void cacheInfo(int dev_id, size_t* total, size_t* largest)
+  {
+    std::lock_guard<std::mutex> lock(mutex);
+    cacheInfoAux(large_blocks, dev_id, total, largest);
+    cacheInfoAux(small_blocks, dev_id, total, largest);
+  }
+
+
   /** combine previously split blocks */
   void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks)
   {
@@ -327,12 +348,20 @@ static cudaError_t THCCachingAllocator_emptyCache(void* ctx)
   return a->emptyCache();
 }
 
+static cudaError_t THCCachingAllocator_cacheInfo(void* ctx, int dev_id, size_t* totalCached, size_t* largestBlock)
+{
+  THCCachingAllocator* a = (THCCachingAllocator*) ctx;
+  a->cacheInfo(dev_id, totalCached, largestBlock);
+  return cudaSuccess;
+}
+
 static THCCachingAllocator caching_allocator;
 static THCDeviceAllocator device_allocator = {
   &THCCachingAllocator_malloc,
   NULL,
   &THCCachingAllocator_free,
   &THCCachingAllocator_emptyCache,
+  &THCCachingAllocator_cacheInfo,
   &caching_allocator
 };
 
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index 13f62be9..403c4fa6 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -41,6 +41,7 @@ static THCDeviceAllocator defaultDeviceAllocator = {
   NULL,
   &cudaFreeWrapper,
   NULL,
+  NULL,
   NULL
 };
 
@@ -710,6 +711,33 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
   return allocator->free(allocator->state, ptr);
 }
 
+cudaError_t THCudaMemGetInfo(THCState *state,  size_t* freeBytes, size_t* totalBytes)
+{
+  size_t cachedBytes = 0;
+  size_t largestBlock = 0;
+  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
+
+  /* get info from CUDA first */
+  cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
+  if (ret!= cudaSuccess)
+    return ret;
+
+  int device;
+  ret = cudaGetDevice(&device);
+  if (ret!= cudaSuccess)
+    return ret;
+
+  /* not always true - our optimistic guess here */
+  largestBlock = *freeBytes;
+  
+  if (allocator->cacheInfo != NULL)
+    allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);
+  
+  /* Adjust resulting free bytes number. largesBlock unused for now */
+  *freeBytes += cachedBytes;
+  return cudaSuccess;
+}
+
 static ptrdiff_t applyHeapDelta(THCState *state) {
   ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
   state->heapDelta = 0;
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index 8f55cf3f..c685d373 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -49,6 +49,7 @@ typedef struct _THCDeviceAllocator {
    cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
    cudaError_t (*free)(void*, void*);
    cudaError_t (*emptyCache)(void*);
+   cudaError_t  (*cacheInfo)(void*, int, size_t*, size_t*);
    void* state;
 } THCDeviceAllocator;
 
@@ -177,6 +178,7 @@ THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int
 
 THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
 THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
+THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
 THC_API void THCSetGCHandler(THCState *state,
                              void (*torchGCHandlerFunction)(void *data),
                              void *data );

From 9aef731946f687e08c2ba4438ab985c433e3044f Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 15 Nov 2016 02:46:49 -0800
Subject: [PATCH 08/13] Added some memory allocations to test_shutdown to test 
 acching allocator

---
 test/test_shutdown.lua | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/test/test_shutdown.lua b/test/test_shutdown.lua
index e78a51e2..a5221642 100644
--- a/test/test_shutdown.lua
+++ b/test/test_shutdown.lua
@@ -1,11 +1,36 @@
 local Threads = require 'threads'
 require 'cutorch'
 
+local function test_cudaEvent()
+   cutorch.reserveStreams(2)
+   cutorch.setStream(1)
+
+   local t1 = torch.CudaTensor(10000000):zero()
+   local t2 = torch.CudaTensor(1):zero()
+
+   local t1View = t1:narrow(1, 10000000, 1)
+   t1:fill(1)
+   print('Memory usage after some allocations [free memory], [total memory]')
+   print(cutorch.getMemoryUsage())
+
+   -- Event is created here
+   local event = cutorch.Event()
+
+   cutorch.setStream(2)
+
+   -- assert below will fail without this
+   event:waitOn()
+   t2:copy(t1View)
+
+   -- revert to default stream
+   cutorch.setStream(0)
+end
+
 print ("cutorch.hasHalf is ", cutorch.hasHalf)
 
 print('Memory usage before intialization of threads [free memory], [total memory]')
 print(cutorch.getMemoryUsage())
-threads = Threads(100, function() require 'cutorch' end)
+threads = Threads(100, function() require 'cutorch'; test_cudaEvent(); end)
 print('Memory usage after intialization of threads [free memory], [total memory]')
 print(cutorch.getMemoryUsage())
 threads:terminate()

From 9a6ba41117663fcd670a2f2b412383dc5d0aa291 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Tue, 15 Nov 2016 14:48:52 -0800
Subject: [PATCH 09/13] Bugfix, test extended

---
 lib/THC/THCCachingAllocator.cpp |  2 +-
 lib/THC/THCGeneral.c            |  2 +-
 test/test_shutdown.lua          | 40 +++++++++++++++++++++++++++------
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
index ab9528e1..a4978fb8 100644
--- a/lib/THC/THCCachingAllocator.cpp
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -212,7 +212,7 @@ struct THCCachingAllocator
     auto it = blocks.lower_bound(&search_key);
     for (;it != blocks.end() && *it && (*it)->device == dev_id; ++it) {
       size_t blocksize = (*it)->size;
-      total += blocksize;
+      *total += blocksize;
       if (blocksize > *largest)
 	*largest = blocksize;
     }
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index 403c4fa6..547e060e 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -732,7 +732,7 @@ cudaError_t THCudaMemGetInfo(THCState *state,  size_t* freeBytes, size_t* totalB
   
   if (allocator->cacheInfo != NULL)
     allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);
-  
+
   /* Adjust resulting free bytes number. largesBlock unused for now */
   *freeBytes += cachedBytes;
   return cudaSuccess;
diff --git a/test/test_shutdown.lua b/test/test_shutdown.lua
index a5221642..e48a058d 100644
--- a/test/test_shutdown.lua
+++ b/test/test_shutdown.lua
@@ -10,8 +10,6 @@ local function test_cudaEvent()
 
    local t1View = t1:narrow(1, 10000000, 1)
    t1:fill(1)
-   print('Memory usage after some allocations [free memory], [total memory]')
-   print(cutorch.getMemoryUsage())
 
    -- Event is created here
    local event = cutorch.Event()
@@ -26,13 +24,41 @@ local function test_cudaEvent()
    cutorch.setStream(0)
 end
 
-print ("cutorch.hasHalf is ", cutorch.hasHalf)
+local Gig = 1024*1024*1024
+
+local function test_getMemInfo()
+   local sz = Gig*0.1
+   local t1 = torch.CudaTensor(sz):zero()
+   print('Memory usage after 1st allocation [free memory], [total memory]')
+   local total, free = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   local t2 = torch.CudaTensor(sz*1.3):zero()
+   print('Memory usage after 2nd allocation [free memory], [total memory]')
+   local total, free = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   t1 = nil
+   collectgarbage()
+   print('Memory usage after 1st deallocation [free memory], [total memory]')
+   local total, free = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+   t2 = nil
+   collectgarbage()
+   print('Memory usage after 2nd deallocation [free memory], [total memory]')
+   total, free = cutorch.getMemoryUsage()
+   print(free/Gig, total/Gig)
+end
 
+print ("cutorch.hasHalf is ", cutorch.hasHalf)
 print('Memory usage before intialization of threads [free memory], [total memory]')
-print(cutorch.getMemoryUsage())
-threads = Threads(100, function() require 'cutorch'; test_cudaEvent(); end)
+local total, free = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
+threads = Threads(20, function() require 'cutorch'; test_getMemInfo(); test_cudaEvent(); end)
 print('Memory usage after intialization of threads [free memory], [total memory]')
-print(cutorch.getMemoryUsage())
+total, free = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
 threads:terminate()
+collectgarbage()  
 print('Memory usage after termination of threads [free memory], [total memory]')
-print(cutorch.getMemoryUsage())
+total, free = cutorch.getMemoryUsage()
+print(free/Gig, total/Gig)
+

From 287689458df744e707b321bb72d6ff2635948966 Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Thu, 17 Nov 2016 03:08:21 -0800
Subject: [PATCH 10/13] Added C++ flags

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3698e68..1ed12563 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,7 @@ SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAG
 ENDIF (NOT WIN32)
 IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
   SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
+  SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_CXX_FLAGS}")
 ENDIF()
 
 INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})

From 0114d65f649e0eb528bf45691ddd4476c4a4db8b Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Sun, 15 Jan 2017 02:16:14 -0800
Subject: [PATCH 11/13] Build working

---
 init.c                                    |  27 +---
 lib/THC/CMakeLists.txt                    |   3 +-
 lib/THC/THCAtomics.cuh                    |   6 +-
 lib/THC/THCBlas.cu                        |   2 +
 lib/THC/THCBlas.h                         |   1 +
 lib/THC/THCCachingAllocator.cpp           |   8 +-
 lib/THC/THCGeneral.c                      |   4 -
 lib/THC/THCGeneral.h.in                   |  10 +-
 lib/THC/THCGenerateHalfType.h             |   2 +-
 lib/THC/THCHalf.cu                        |   1 -
 lib/THC/THCHalf.h                         |  30 ++--
 lib/THC/THCNumerics.cuh                   | 178 ++++++++++++----------
 lib/THC/THCStorage.c                      |   2 +
 lib/THC/THCStorage.cu                     |   2 +
 lib/THC/THCStorage.h                      |   3 +-
 lib/THC/THCStorageCopy.c                  |   1 +
 lib/THC/THCStorageCopy.cu                 |   2 +-
 lib/THC/THCStorageCopy.h                  |   1 +
 lib/THC/THCTensor.h                       |   3 +-
 lib/THC/THCTensorCopy.cu                  |   1 +
 lib/THC/THCTensorCopy.h                   |   1 +
 lib/THC/THCTensorIndex.cu                 |   6 +-
 lib/THC/THCTensorMath.h                   |   1 +
 lib/THC/THCTensorMathPairwise.cu          |   4 +-
 lib/THC/THCTensorMathPointwise.cuh        |  47 +-----
 lib/THC/THCTensorTypeUtils.cu             |   1 +
 lib/THC/THCTensorTypeUtils.cuh            |  54 ++++++-
 lib/THC/generic/THCStorageCopy.c          |   4 +-
 lib/THC/generic/THCStorageCopy.h          |   1 -
 lib/THC/generic/THCTensorCopy.h           |   1 -
 lib/THC/generic/THCTensorMathBlas.cu      |   2 +-
 lib/THC/generic/THCTensorMathPairwise.cu  |   2 +-
 lib/THC/generic/THCTensorMathPointwise.cu |   4 +-
 lib/THC/generic/THCTensorMathReduce.cu    |   8 +-
 34 files changed, 215 insertions(+), 208 deletions(-)

diff --git a/init.c b/init.c
index 8f4eb452..1e2b6c49 100644
--- a/init.c
+++ b/init.c
@@ -699,16 +699,8 @@ static int cutorch_setKernelPeerToPeerAccess(lua_State *L)
 }
 
 static int cutorch_getMemoryUsage(lua_State *L) {
-  size_t totalBytes = 0;
-<<<<<<< HEAD
   size_t freeBytes = 0;
-
-  THCState *state = cutorch_getstate(L);
-
-  int device = luaL_optint(L, 1, -10);
-  if (device != -10) { /* no argument passed, current device mem usage */
-    --device;
-=======
+  size_t totalBytes = 0;
   int curDevice;
   THCudaCheck(cudaGetDevice(&curDevice));
   THCState *state = cutorch_getstate(L);
@@ -720,24 +712,7 @@ static int cutorch_getMemoryUsage(lua_State *L) {
     THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
     THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
     THCudaCheck(cudaSetDevice(curDevice));
->>>>>>> upstream/master
   }
-
-  int prevDevice, curDevice = -10;
-  THCudaCheck(cudaGetDevice(&prevDevice));
-
-  if (device != -10) { /* no argument passed, current device mem usage */
-    curDevice = device; /* zero indexed */
-    if (curDevice != prevDevice)
-      THCudaCheck(cudaSetDevice(curDevice)); 
-  } 
-
-  THCudaCheck(THCudaMemGetInfo(state, &totalBytes, &freeBytes));
-
-  if (curDevice != prevDevice) { /* restore current device if we have changed it */
-    THCudaCheck(cudaSetDevice(prevDevice));
-  }
-
   lua_pushnumber(L, freeBytes);
   lua_pushnumber(L, totalBytes);
   return 2;
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index d5271098..a18451af 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -190,9 +190,8 @@ MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})
 IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
   MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
   LIST(APPEND src-cuda THCHalf.cu)
-  LIST(APPEND src-cuda THCHalf.c)
   LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
-  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 -DTH_GENERIC_USE_HALF=1 ${CMAKE_C_FLAGS}")
+  SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
 ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
   MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
 ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
index d03d1740..ac0b45f6 100644
--- a/lib/THC/THCAtomics.cuh
+++ b/lib/THC/THCAtomics.cuh
@@ -1,7 +1,7 @@
 #ifndef THC_ATOMICS_INC
 #define THC_ATOMICS_INC
 
-#include "THCGeneral.h"
+#include "THCHalf.h"
 
 template <typename T, size_t n>
 struct AtomicAddIntegerImpl;
@@ -98,12 +98,12 @@ static inline  __device__ void atomicAdd(half *address, half val) {
       (unsigned int *) ((char *)address - ((size_t)address & 2));
   unsigned int old = *address_as_ui;
   unsigned int assumed;
-  typedef THCNumerics<half> N_;
+
   do {
     assumed = old;
     half hsum;
     hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-    hsum = N_::s_(N_::add(hsum, val));
+    hsum = THCNumerics<half>::add(hsum, val);
     old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
     old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
index 8675945e..c438ad8e 100644
--- a/lib/THC/THCBlas.cu
+++ b/lib/THC/THCBlas.cu
@@ -1,4 +1,6 @@
 #include "THCBlas.h"
+#include "THCGeneral.h"
+#include "THCHalf.h"
 
 float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy)
 {
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
index bd9a3807..bf91f936 100644
--- a/lib/THC/THCBlas.h
+++ b/lib/THC/THCBlas.h
@@ -2,6 +2,7 @@
 #define THC_BLAS_INC
 
 #include "THCGeneral.h"
+#include "THCHalf.h"
 
 /* Level 1 */
 THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy);
diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp
index 8b43ef2b..eeae04a2 100644
--- a/lib/THC/THCCachingAllocator.cpp
+++ b/lib/THC/THCCachingAllocator.cpp
@@ -158,12 +158,12 @@ struct THCCachingAllocator
     allocated_blocks.erase(it);
 
     bool small = block->size <= kSmallAlloc;
-    auto& cur_free_blocks = small ? large_blocks : small_blocks;
-    try_merge_blocks(block, block->prev, cur_free_blocks);
-    try_merge_blocks(block, block->next, cur_free_blocks);
+    auto& free_blocks = small ? large_blocks : small_blocks;
+    try_merge_blocks(block, block->prev, free_blocks);
+    try_merge_blocks(block, block->next, free_blocks);
 
     block->allocated = false;
-    cur_free_blocks.insert(block);
+    free_blocks.insert(block);
 
     return cudaSuccess;
   }
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index fa699aa7..c442bd87 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -718,11 +718,7 @@ cudaError_t THCudaMemGetInfo(THCState *state,  size_t* freeBytes, size_t* totalB
 
   /* not always true - our optimistic guess here */
   largestBlock = *freeBytes;
-<<<<<<< HEAD
-  
-=======
 
->>>>>>> upstream/master
   if (allocator->cacheInfo != NULL)
     allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);
 
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index c4eeff39..a88bd7db 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -6,6 +6,12 @@
 #include "THCThreadLocal.h"
 #undef log1p
 
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "cublas_v2.h"
+
+#cmakedefine USE_MAGMA
+
 #ifdef __cplusplus
 # define THC_EXTERNC extern "C"
 #else
@@ -38,10 +44,6 @@ struct THCRNGState;  /* Random number generator state. */
 typedef struct THCStream THCStream;
 typedef struct THCState THCState;
 
-#include "THCHalf.h"
-
-#cmakedefine USE_MAGMA
-
 typedef struct _THCDeviceAllocator {
    cudaError_t (*malloc)( void*, void**, size_t,         cudaStream_t);
    cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
diff --git a/lib/THC/THCGenerateHalfType.h b/lib/THC/THCGenerateHalfType.h
index 596c6b1e..77d4c0ad 100644
--- a/lib/THC/THCGenerateHalfType.h
+++ b/lib/THC/THCGenerateHalfType.h
@@ -2,7 +2,7 @@
 #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h"
 #endif
 
-#include "THCGeneral.h"
+#include "THCHalf.h"
 
 #if defined(CUDA_HALF_TENSOR) || defined(FORCE_TH_HALF)
 
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index a9b468fe..023774e9 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -1,6 +1,5 @@
 #include "THCHalf.h"
 #include "THCThrustAllocator.cuh"
-
 #include <thrust/transform.h>
 #include <thrust/execution_policy.h>
 
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 6a6d7304..5e35694e 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -13,9 +13,6 @@
 
 #ifdef CUDA_HALF_TENSOR
 
-#  undef  TH_GENERIC_USE_HALF
-#  define TH_GENERIC_USE_HALF 1
-
 #include "THCGeneral.h"
 #include "THHalf.h"
 
@@ -23,8 +20,8 @@
 
 THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
 THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
-# define THC_float2half(a) TH_float2half(a)
-# define THC_half2float(a) TH_half2float(a)
+THC_API half THC_float2half(float a);
+THC_API float THC_half2float(half a);
 
 /* Check for native fp16 support on the current device (CC 5.3+) */
 THC_API int THC_nativeHalfInstructions(THCState *state);
@@ -33,30 +30,23 @@ THC_API int THC_nativeHalfInstructions(THCState *state);
 THC_API int THC_fastHalfInstructions(THCState *state);
 
  #  if defined (__CUDA_ARCH__)
-
 /* use instrintic functons defined for device only in cuda_fp16.h */
 #   define THC_FLOAT_TO_HALF(x) __float2half((float)x)
 #   define THC_HALF_TO_FLOAT(x) __half2float(x)
+#   define THC_DECL __host__ __device__ __forceinline__
 #  else
 /* use host conversion functions */
 #   define THC_FLOAT_TO_HALF(x) THC_float2half((float)x)
 #   define THC_HALF_TO_FLOAT(x) THC_half2float(x)
+#   define THC_DECL inline
 #  endif
 
-#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-# define CUDA_FP16_INSTRINTICS 1
+#if 0 // __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620
+# define CUDA_HALF_INSTRUCTIONS 1
 #endif
 
 #if defined (__cplusplus__) || defined (__CUDACC__)
 
-__host__ __device__ __forceinline__ bool operator==(const half& a, const half& b) {
-  return a.x == b.x;
-}
-
-__host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b) {
-  return a.x != b.x;
-}
-
 /// `half` has some type conversion issues associated with it, since it
 /// is a struct without a constructor/implicit conversion constructor.
 /// We use this to convert scalar values to the given type that the
@@ -64,26 +54,26 @@ __host__ __device__ __forceinline__ bool operator!=(const half& a, const half& b
 
 template <typename In, typename Out>
 struct ScalarConvert {
-  static inline __host__ __device__ Out to(const In& v) { return Out(v); }
+  static THC_DECL Out to(const In& v) { return Out(v); }
 };
 
 template <typename Out>
 struct ScalarConvert<half, Out> {
-  static __host__ __device__ __forceinline__ Out to(const half& v) {
+  static THC_DECL Out to(const half& v) {
     return (Out) THC_HALF_TO_FLOAT(v);
   }
 };
 
 template <typename In>
 struct ScalarConvert<In, half> {
-  static __host__ __device__ __forceinline__ half to(const In& v) {
+  static THC_DECL half to(const In& v) {
     return THC_FLOAT_TO_HALF(v);
   }
 };
 
 template <>
 struct ScalarConvert<half, half> {
-  static __host__ __device__ __forceinline__ const half& to(const half& v) {
+  static THC_DECL const half& to(const half& v) {
     return v;
   }
 };
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index 1fab1927..a5d558c1 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -8,20 +8,20 @@
 template <typename T>
 struct THCNumConstants
 {
-   static __host__ __device__ const T one()  { return T(1); }
-   static __host__ __device__ const T zero() { return T(0); }
-   static __host__ __device__ const T min()  { return std::numeric_limits<T>::min(); }
-   static __host__ __device__ const T max()  { return std::numeric_limits<T>::max(); }
+   static THC_DECL const T one()  { return T(1); }
+   static THC_DECL const T zero() { return T(0); }
+   static THC_DECL const T min()  { return std::numeric_limits<T>::min(); }
+   static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
 
 };
 
 template <>
 struct THCNumConstants<half>
 {
-  static __host__ __device__ const half one()  { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */
-  static __host__ __device__ const half zero() { half ret; ret.x = 0;  return ret;}
-  static __host__ __device__ const half min() { half ret; ret.x = 0x0400;  return ret; }
-  static __host__ __device__ const half max() { half ret; ret.x = 0x7BFF;  return ret; }
+  static THC_DECL const half one()  { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */
+  static THC_DECL const half zero() { half ret; ret.x = 0;  return ret;}
+  static THC_DECL const half min() { half ret; ret.x = 0xFBFF;  return ret; }
+  static THC_DECL const half max() { half ret; ret.x = 0x7BFF;  return ret; }
 };
 
 template <typename T, typename M>
@@ -35,29 +35,34 @@ struct THCNumCommonBase {
    /* type of math operation result , like (a*b). Usually == StorageType */
    typedef T expr_type;
 
-  static __host__ __device__ __forceinline__ math_type m_(const storage_type& a) {
+  static THC_DECL  math_type m_(const storage_type& a) {
     return ScalarConvert<storage_type, math_type>::to(a);
   }
-  static __host__ __device__ __forceinline__ expr_type e_(const math_type& a) {
+  static THC_DECL  expr_type e_(const math_type& a) {
     return ScalarConvert<math_type, expr_type>::to(a);
   }
-  static __host__ __device__ __forceinline__ storage_type s_(const expr_type& a) {
+  static THC_DECL  storage_type s_(const expr_type& a) {
     return ScalarConvert<expr_type, storage_type>::to(a);
   }
-  static inline __host__ __device__ bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
-  static inline __host__ __device__ bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
-  static inline __host__ __device__ bool gt(const storage_type&  a, const storage_type&  b) { return m_(a) > m_(b);  }
-  static inline __host__ __device__ bool ge(const storage_type&  a, const storage_type&  b) { return m_(a) >= m_(b); }
-  static inline __host__ __device__ bool eq(const storage_type&  a, const storage_type&  b) { return m_(a) == m_(b); }
-  static inline __host__ __device__ bool ne(const storage_type&  a, const storage_type&  b) { return m_(a) != m_(b); }
-
-  static inline __host__ __device__  expr_type  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
-  static inline __host__ __device__  expr_type  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
-  static inline __host__ __device__  expr_type  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
-  static inline __host__ __device__  expr_type  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
-  static inline __host__ __device__  expr_type  abs(const storage_type&  a) { bool isneg = (a<0); return e_(isneg ? -a  : a); }
-  static inline __host__ __device__  expr_type  neg(const storage_type& a) { return  e_(-m_(a)); }
-  static inline __host__ __device__  expr_type pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); }
+
+  static THC_DECL const T min()  { return THCNumConstants<T>::min(); }
+  static THC_DECL const T max()  { return THCNumConstants<T>::max(); }
+
+  static  THC_DECL bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
+  static  THC_DECL bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
+  static  THC_DECL bool gt(const storage_type&  a, const storage_type&  b) { return m_(a) > m_(b);  }
+  static  THC_DECL bool ge(const storage_type&  a, const storage_type&  b) { return m_(a) >= m_(b); }
+  static  THC_DECL bool eq(const storage_type&  a, const storage_type&  b) { return m_(a) == m_(b); }
+  static  THC_DECL bool ne(const storage_type&  a, const storage_type&  b) { return m_(a) != m_(b); }
+
+  static  THC_DECL  expr_type  add(const storage_type&  a, const storage_type&  b) { return e_(m_(a) + m_(b)); }
+  static  THC_DECL  expr_type  mul(const storage_type&  a, const storage_type&  b) { return e_(m_(a) * m_(b)); }
+  static  THC_DECL  expr_type  sub(const storage_type&  a, const storage_type&  b) { return e_(m_(a) - m_(b)); }
+  static  THC_DECL  expr_type  div(const storage_type&  a, const storage_type&  b) { return e_(m_(a) / m_(b)); }
+  static  THC_DECL  expr_type  abs(const storage_type&  a) { bool isneg = (a<0); return e_(isneg ? -a  : a); }
+  static  THC_DECL  expr_type  neg(const storage_type& a) { return  e_(-m_(a)); }
+  static  THC_DECL  expr_type  pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); }
+  static  THC_DECL  expr_type  mod(const storage_type&  a, const storage_type&  b) { return e_(m_(a) % m_(b)); }
 
 };
 
@@ -70,7 +75,7 @@ struct THCNumBase<T, M, true>  : public THCNumCommonBase<T, M> {
 
 template <>
 struct THCNumBase<long, long, true>  : public THCNumCommonBase<long, long> {
-  static inline __host__ __device__  expr_type  abs(const storage_type&  a) { return labs(a); }
+  static  THC_DECL  expr_type  abs(const storage_type&  a) { return labs(a); }
 };
 
 template <typename T, typename M>
@@ -83,30 +88,32 @@ struct THCNumBase<T, M, false> : public THCNumCommonBase<T, M> {
   using typename Base::expr_type;
   using typename Base::storage_type;
 
-  static inline __host__ __device__  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a))); }
-  static inline __host__ __device__  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
-  static inline __host__ __device__  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
-  static inline __host__ __device__  expr_type cos  (const storage_type& a) { return  e_(::cos(m_(a))); }
-  static inline __host__ __device__  expr_type sin  (const storage_type& a) { return  e_(::sin(m_(a))); }
-  static inline __host__ __device__  expr_type sqrt (const storage_type& a) { return  e_(::sqrt(m_(a))); }
-  static inline __host__ __device__  expr_type rsqrt(const storage_type& a) { return  e_(::rsqrt(m_(a))); }
-  static inline __host__ __device__  expr_type ceil (const storage_type& a) { return  e_(::ceil(m_(a))); }
-  static inline __host__ __device__  expr_type floor(const storage_type& a) { return  e_(::floor(m_(a))); }
-  static inline __host__ __device__  expr_type trunc(const storage_type& a) { return  e_(::trunc(m_(a))); }
-  static inline __host__ __device__  expr_type acos (const storage_type& a) { return  e_(::acos(m_(a))); }
-  static inline __host__ __device__  expr_type cosh (const storage_type& a) { return  e_(::cosh(m_(a))); }
-  static inline __host__ __device__  expr_type acosh(const storage_type& a) { return  e_(::acosh(m_(a))); }
-  static inline __host__ __device__  expr_type asin (const storage_type& a) { return  e_(::asin(m_(a))); }
-  static inline __host__ __device__  expr_type sinh (const storage_type& a) { return  e_(::sinh(m_(a))); }
-  static inline __host__ __device__  expr_type asinh(const storage_type& a) { return  e_(::asinh(m_(a))); }
-  static inline __host__ __device__  expr_type tan  (const storage_type& a) { return  e_(::tan(m_(a))); }
-  static inline __host__ __device__  expr_type atan (const storage_type& a) { return  e_(::atan(m_(a))); }
-  static inline __host__ __device__  expr_type tanh (const storage_type& a) { return  e_(::tanh(m_(a))); }
-  static inline __host__ __device__  expr_type abs  (const storage_type& a) { return  e_(::abs(m_(a))); }
-  static inline __host__ __device__  expr_type round(const storage_type& a) { return  e_(::round(m_(a))); }
-  static inline __host__ __device__  expr_type frac (const storage_type& a) { return  e_(m_(a) - ::trunc(m_(a))); }
-  static inline __host__ __device__  expr_type cinv (const storage_type& a) { return  Base::div(THCNumConstants<T>::one(), a); }
-  static inline __host__ __device__  expr_type pow  (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); }
+  static  THC_DECL  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a))); }
+  static  THC_DECL  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
+  static  THC_DECL  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
+  static  THC_DECL  expr_type cos  (const storage_type& a) { return  e_(::cos(m_(a))); }
+  static  THC_DECL  expr_type sin  (const storage_type& a) { return  e_(::sin(m_(a))); }
+  static  THC_DECL  expr_type sqrt (const storage_type& a) { return  e_(::sqrt(m_(a))); }
+  static  THC_DECL  expr_type rsqrt(const storage_type& a) { return  e_(::rsqrt(m_(a))); }
+  static  THC_DECL  expr_type ceil (const storage_type& a) { return  e_(::ceil(m_(a))); }
+  static  THC_DECL  expr_type floor(const storage_type& a) { return  e_(::floor(m_(a))); }
+  static  THC_DECL  expr_type trunc(const storage_type& a) { return  e_(::trunc(m_(a))); }
+  static  THC_DECL  expr_type acos (const storage_type& a) { return  e_(::acos(m_(a))); }
+  static  THC_DECL  expr_type cosh (const storage_type& a) { return  e_(::cosh(m_(a))); }
+  static  THC_DECL  expr_type acosh(const storage_type& a) { return  e_(::acosh(m_(a))); }
+  static  THC_DECL  expr_type asin (const storage_type& a) { return  e_(::asin(m_(a))); }
+  static  THC_DECL  expr_type sinh (const storage_type& a) { return  e_(::sinh(m_(a))); }
+  static  THC_DECL  expr_type asinh(const storage_type& a) { return  e_(::asinh(m_(a))); }
+  static  THC_DECL  expr_type tan  (const storage_type& a) { return  e_(::tan(m_(a))); }
+  static  THC_DECL  expr_type atan (const storage_type& a) { return  e_(::atan(m_(a))); }
+  static  THC_DECL  expr_type tanh (const storage_type& a) { return  e_(::tanh(m_(a))); }
+  static  THC_DECL  expr_type abs  (const storage_type& a) { return  e_(::abs(m_(a))); }
+  static  THC_DECL  expr_type round(const storage_type& a) { return  e_(::round(m_(a))); }
+  static  THC_DECL  expr_type frac (const storage_type& a) { return  e_(m_(a) - ::trunc(m_(a))); }
+  static  THC_DECL  expr_type cinv (const storage_type& a) { return  Base::div(THCNumConstants<T>::one(), a); }
+  static  THC_DECL  expr_type pow  (const storage_type& a, T b) { return e_(::pow(m_(a), m_(b))); }
+  static  THC_DECL  expr_type mod  (const storage_type& a, const storage_type& b) { return  e_(::fmod(m_(a), m_(b))); }
+
 };
 
 template <typename T>
@@ -120,6 +127,7 @@ struct THCNumerics: public THCNumBase<T, T, std::numeric_limits<T>::is_integer>
 
 #ifdef CUDA_HALF_TENSOR
 
+#ifndef CUDA_HALF_INSTRUCTIONS
 template <>
 struct THCNumerics<half>: public THCNumBase<half, float, false>  {
   typedef THCNumCommonBase<half, float> Base;
@@ -132,85 +140,99 @@ struct THCNumerics<half>: public THCNumBase<half, float, false>  {
   typedef THCNumConstants<half> Constants;
 };
 
-
-#if TO_BE_CONTINUED_WITH_EXTRA_TEMPLATE_PARAM
+#else
 template <>
-struct THCNumerics<half,half>: public THCNumBase<half, half, false>  {
+struct THCNumerics<half>: public THCNumBase<half, float, false>  {
   typedef THCNumCommonBase<half, half> Base;
-  using typename Base::math_type;
-  using typename Base::expr_type;
-  using typename Base::storage_type;
-  using Base::e_;
-  using Base::m_;
-  using Base::s_;
   typedef THCNumConstants<half> Constants;
 
-  static inline __host__ __device__ bool lt(const half& a, const half& b) {
+  typedef typename Base::storage_type storage_type;
+  typedef typename Base::math_type    math_type;
+  typedef typename Base::expr_type    expr_type;
+  static THC_DECL  math_type m_(const storage_type& a) {
+    return ScalarConvert<storage_type, math_type>::to(a);
+  }
+  static THC_DECL  expr_type e_(const math_type& a) {
+    return ScalarConvert<math_type, expr_type>::to(a);
+  }
+  static THC_DECL  storage_type s_(const expr_type& a) {
+    return ScalarConvert<expr_type, storage_type>::to(a);
+  }
+
+  static  THC_DECL bool lt(const half& a, const half& b) {
     return __hlt(a, b);
   }
-  static inline __host__ __device__ bool le(const half& a, const half& b) {
+  static  THC_DECL bool le(const half& a, const half& b) {
     return __hle(a, b);
   }
 
-  static inline __host__ __device__ bool gt(const half& a, const half& b) {
+  static  THC_DECL bool gt(const half& a, const half& b) {
     return __hgt(a, b);
   }
 
-  static inline __host__ __device__ bool ge(const half& a, const half& b) {
+  static  THC_DECL bool ge(const half& a, const half& b) {
     return __hge(a, b);
   }
 
-  static inline __host__ __device__ bool eq(const half& a, const half& b) {
+  static  THC_DECL bool eq(const half& a, const half& b) {
     return __heq(a, b);
   }
 
-  static inline __host__ __device__ bool ne(const half& a, const half& b) {
+  static  THC_DECL bool ne(const half& a, const half& b) {
     return __hne(a, b);
   }
-  static inline __host__ __device__ half exp(const half& a) {
+  static  THC_DECL half exp(const half& a) {
     return hexp(a);
   }
-  static inline __host__ __device__ half log(const half& a) {
+  static  THC_DECL half log(const half& a) {
     return hlog(a);
   }
-  static inline __host__ __device__ half cos(const half& a) {
+  static  THC_DECL half cos(const half& a) {
     return hcos(a);
   }
-  static inline __host__ __device__ half sin(const half& a) {
+  static  THC_DECL half sin(const half& a) {
     return hsin(a);
   }
-  static inline __host__ __device__ half sqrt(const half& a) {
+  static  THC_DECL half sqrt(const half& a) {
     return hsqrt(a);
   }
-  static inline __host__ __device__ half rsqrt(const half& a) {
+  static  THC_DECL half rsqrt(const half& a) {
     return hrsqrt(a);
   }
-  static inline __host__ __device__ half ceil(const half& a) {
+  static  THC_DECL half ceil(const half& a) {
     return hceil(a);
   }
 
-  static inline __host__ __device__ half floor(const half& a) {
+  static  THC_DECL half floor(const half& a) {
     return hfloor(a);
   }
 
-  static inline __host__ __device__ half trunc(const half& a) {
+  static  THC_DECL half trunc(const half& a) {
     return htrunc(a);
   }
 
-  static inline __host__ __device__ half neg(const half& a) {
+  static  THC_DECL half neg(const half& a) {
     return __hneg(a);
   }
 
-  static inline __host__ __device__ const half& add(const half& a, const half& b) {
+  static  THC_DECL const half& add(const half& a, const half& b) {
     return __hadd(a, b);
   }
-  static inline __host__ __device__ half mul(const half& a, const half& b) {
+  static  THC_DECL half mul(const half& a, const half& b) {
     return __hmul(a, b);
   }
 
-  static inline __host__ __device__ half sub(const half& a, const half& b) {
+  static  THC_DECL half sub(const half& a, const half& b) {
     return __hsub(a, b);
   }
+
+  static  THC_DECL half div  (const half& a, const half& b) { 
+    return  hdiv(a,b); 
+  }
+  static  THC_DECL half mod  (const half& a, const half& b) { 
+    return __float2half(fmodf(__half2float(a), __half2float(b)));
+  }
+
 };
 # endif
 #endif
diff --git a/lib/THC/THCStorage.c b/lib/THC/THCStorage.c
index 6fc9574e..669efa82 100644
--- a/lib/THC/THCStorage.c
+++ b/lib/THC/THCStorage.c
@@ -2,5 +2,7 @@
 #include "THCGeneral.h"
 #include "THAtomic.h"
 
+#include "THCHalf.h"
+
 #include "generic/THCStorage.c"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.cu b/lib/THC/THCStorage.cu
index eba9ef66..5555c6f1 100644
--- a/lib/THC/THCStorage.cu
+++ b/lib/THC/THCStorage.cu
@@ -7,5 +7,7 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
+#include "THCHalf.h"
+
 #include "generic/THCStorage.cu"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCStorage.h b/lib/THC/THCStorage.h
index 6a27fa34..ac1cd709 100644
--- a/lib/THC/THCStorage.h
+++ b/lib/THC/THCStorage.h
@@ -1,9 +1,8 @@
 #ifndef THC_STORAGE_INC
 #define THC_STORAGE_INC
 
-#include "THCGeneral.h"
-
 #include "THStorage.h"
+#include "THCGeneral.h"
 
 #define THCStorage        TH_CONCAT_3(TH,CReal,Storage)
 #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME)
diff --git a/lib/THC/THCStorageCopy.c b/lib/THC/THCStorageCopy.c
index 6dcb06c0..ee9bf815 100644
--- a/lib/THC/THCStorageCopy.c
+++ b/lib/THC/THCStorageCopy.c
@@ -1,4 +1,5 @@
 #include "THCStorageCopy.h"
+
 #include "THCTensorCopy.h"
 
 #include "generic/THCStorageCopy.c"
diff --git a/lib/THC/THCStorageCopy.cu b/lib/THC/THCStorageCopy.cu
index 68113dbc..56641888 100644
--- a/lib/THC/THCStorageCopy.cu
+++ b/lib/THC/THCStorageCopy.cu
@@ -1,5 +1,5 @@
-#include "THCGeneral.h"
 #include "THCStorageCopy.h"
+#include "THCGeneral.h"
 
 #include "THCHalf.h"
 #include "THCTensorCopy.h"
diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h
index ec8011d3..837056fc 100644
--- a/lib/THC/THCStorageCopy.h
+++ b/lib/THC/THCStorageCopy.h
@@ -3,6 +3,7 @@
 
 #include "THCStorage.h"
 #include "THCGeneral.h"
+#include "THCHalf.h"
 
 #include "generic/THCStorageCopy.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensor.h b/lib/THC/THCTensor.h
index 49fbf947..e2521b11 100644
--- a/lib/THC/THCTensor.h
+++ b/lib/THC/THCTensor.h
@@ -1,10 +1,9 @@
 #ifndef THC_TENSOR_INC
 #define THC_TENSOR_INC
 
-#include "THCGeneral.h"
 #include "THTensor.h"
 #include "THCStorage.h"
-
+#include "THCGeneral.h"
 
 #define THCTensor          TH_CONCAT_3(TH,CReal,Tensor)
 #define THCTensor_(NAME)   TH_CONCAT_4(TH,CReal,Tensor_,NAME)
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
index 2ae3cfe9..e6b35677 100644
--- a/lib/THC/THCTensorCopy.cu
+++ b/lib/THC/THCTensorCopy.cu
@@ -1,4 +1,5 @@
 #include "THCApply.cuh"
+#include "THCHalf.h"
 #include "THCNumerics.cuh"
 
 inline int curGPU() {
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
index fc206cb7..e8bc4f4b 100644
--- a/lib/THC/THCTensorCopy.h
+++ b/lib/THC/THCTensorCopy.h
@@ -3,6 +3,7 @@
 
 #include "THCTensor.h"
 #include "THCGeneral.h"
+#include "THCHalf.h"
 
 #include "generic/THCTensorCopy.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu
index 1fe3683f..415e6256 100644
--- a/lib/THC/THCTensorIndex.cu
+++ b/lib/THC/THCTensorIndex.cu
@@ -1,10 +1,10 @@
-#include "THCGeneral.h"
-
+#include "THC.h"
 #include "THCTensorMath.h"
+#include "THCGeneral.h"
 #include "THCBlas.h"
 #include "THCTensorCopy.h"
 #include "THCTensorRandom.h"
-
+#include "THCHalf.h"
 #include "THCApply.cuh"
 #include "THCReduce.cuh"
 #include "THCDeviceUtils.cuh"
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index e074a3a7..19ae679c 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -2,6 +2,7 @@
 #define TH_CUDA_TENSOR_MATH_INC
 
 #include "THCTensor.h"
+#include "THCGeneral.h"
 
 #include "generic/THCTensorMath.h"
 #include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 90b6fb3b..1511fb88 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -111,7 +111,7 @@ struct TensorRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* out, half* in) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in,  __hmul(val, hfloor(__hdiv(*in,  val))));
+    *out = __hsub(*in,  __hmul(val, hfloor(hdiv(*in,  val))));
 #else
     float fin = __half2float(*in);
     float fout = fin - fval * floorf(fin / fval);
@@ -121,7 +121,7 @@ struct TensorRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* v) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *v = __hsub(*v, __hmul(val, hfloor(__hdiv(*v, val))));
+    *v = __hsub(*v, __hmul(val, hfloor(hdiv(*v, val))));
 #else
     float fv = __half2float(*v);
     fv = fv - fval * floorf(fv / fval);
diff --git a/lib/THC/THCTensorMathPointwise.cuh b/lib/THC/THCTensorMathPointwise.cuh
index 87ab8b41..f9499c5e 100644
--- a/lib/THC/THCTensorMathPointwise.cuh
+++ b/lib/THC/THCTensorMathPointwise.cuh
@@ -175,7 +175,7 @@ template <>
 struct TensorCRemainderOp<half> {
   __device__ __forceinline__ void operator()(half* out, half* in) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*out, __hmul(*in, hfloor(__hdiv(*out, *in))));
+    *out = __hsub(*out, __hmul(*in, hfloor(hdiv(*out, *in))));
 #else
     float fout = __half2float(*out);
     float fin = __half2float(*in);
@@ -185,7 +185,7 @@ struct TensorCRemainderOp<half> {
 
   __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
 #ifdef CUDA_HALF_INSTRUCTIONS
-    *out = __hsub(*in1, __hmul(*in2, hfloor(__hdiv(*in1, *in2))));
+    *out = __hsub(*in1, __hmul(*in2, hfloor(hdiv(*in1, *in2))));
 #else
     float fin1 = __half2float(*in1);
     float fin2 = __half2float(*in2);
@@ -197,49 +197,14 @@ struct TensorCRemainderOp<half> {
 
 template <typename T>
 struct TensorCFmodOp {
-  __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = *out % *in;
-  }
-
+  typedef THCNumerics<T> N_;
   __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
-    *out = *in1 % *in2;
-  }
-};
-
-template <>
-struct TensorCFmodOp<float> {
-  __device__ __forceinline__ void operator()(float* out, float* in) {
-    *out = fmodf(*out, *in);
-  }
-
-  __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
-    *out = fmodf(*in1, *in2);
+    *out = N_::s_(N_::mod(*in1,*in2));
   }
-};
-
-template <>
-struct TensorCFmodOp<double> {
-  __device__ __forceinline__ void operator()(double* out, double* in) {
-    *out = fmod(*out, *in);
-  }
-
-  __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
-    *out = fmod(*in1, *in2);
-  }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <>
-struct TensorCFmodOp<half> {
-  __device__ __forceinline__ void operator()(half* out, half* in) {
-    *out = __float2half(fmodf(__half2float(*out), __half2float(*in)));
-  }
-
-  __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
-    *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2)));
+  __device__ __forceinline__ void operator()(T* out, T* in) {
+    this->operator()(out, out, in);
   }
 };
-#endif // CUDA_HALF_TENSOR
 
 template <typename T>
 struct TensorClampOp {
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index 43a2f4bb..bdcbcbec 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -1,6 +1,7 @@
 #include "THCTensorTypeUtils.cuh"
 #include "THCTensor.h"
 #include "THCTensorCopy.h"
+#include "THCHalf.h"
 #include <stdlib.h>
 
 namespace {
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 7c32eeca..273606e5 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -4,6 +4,7 @@
 #include <cuda.h>
 #include <assert.h>
 #include "THCGeneral.h"
+#include "THCHalf.h"
 #include "THCTensor.h"
 #include "THCTensorInfo.cuh"
 
@@ -74,7 +75,7 @@ TENSOR_UTILS(THCudaCharTensor, char, long);
 TENSOR_UTILS(THCudaShortTensor, short, long);
 TENSOR_UTILS(THCudaIntTensor, int, long);
 TENSOR_UTILS(THCudaLongTensor, long, long);
-TENSOR_UTILS(THCudaTensor, float, double);
+TENSOR_UTILS(THCudaTensor, float, float);
 TENSOR_UTILS(THCudaDoubleTensor, double, double);
 
 #ifdef CUDA_HALF_TENSOR
@@ -125,4 +126,55 @@ getTensorInfo(THCState* state, TensorType* t) {
     TensorUtils<TensorType>::getData(state, t), dims, sz, st);
 }
 
+template <typename T>
+struct ScalarNegate {
+  static __host__ __device__ T to(const T v) { return -v; }
+};
+
+template <typename T>
+struct ScalarInv {
+  static __host__ __device__ T to(const T v) { return ((T) 1) / v; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ScalarNegate<half> {
+  static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hneg(v);
+#else
+    return __float2half(-__half2float(v));
+#endif
+#else
+    half out = v;
+    out.x ^= 0x8000; // toggle sign bit
+    return out;
+#endif
+  }
+};
+
+template <>
+struct ScalarInv<half> {
+  static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+    return __float2half(1.0f / __half2float(v));
+#else
+    float fv = THC_half2float(v);
+    fv = 1.0f / fv;
+    return THC_float2half(fv);
+#endif
+  }
+};
+
+inline bool operator==(half a, half b) {
+  return a.x == b.x;
+}
+
+inline bool operator!=(half a, half b) {
+  return a.x != b.x;
+}
+
+#endif // CUDA_HALF_TENSOR
+
 #endif // THC_TENSOR_TYPE_UTILS_INC
diff --git a/lib/THC/generic/THCStorageCopy.c b/lib/THC/generic/THCStorageCopy.c
index 517df27c..13069069 100644
--- a/lib/THC/generic/THCStorageCopy.c
+++ b/lib/THC/generic/THCStorageCopy.c
@@ -25,6 +25,7 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Short)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Int)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Long)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Float)
+TH_CUDA_STORAGE_IMPLEMENT_COPY(Half)
 TH_CUDA_STORAGE_IMPLEMENT_COPY(Double)
 
 void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src)
@@ -44,7 +45,6 @@ void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Sto
   THCTensor_(free)(state, srcTensor);                                       \
   TH##TYPEC##Tensor_free(selfTensor);                                   \
 }
-
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Byte)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Char)
 TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Short)
@@ -56,7 +56,5 @@ TH_CUDA_STORAGE_IMPLEMENT_COPYTO(Double)
 
 #undef TH_CUDA_STORAGE_IMPLEMENT_COPY
 #undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO
-#undef TH_CUDA_STORAGE_IMPLEMENT_COPY_AUX
-#undef TH_CUDA_STORAGE_IMPLEMENT_COPYTO_AUX
 
 #endif
diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h
index f5311094..7a4ef6bb 100644
--- a/lib/THC/generic/THCStorageCopy.h
+++ b/lib/THC/generic/THCStorageCopy.h
@@ -22,7 +22,6 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru
 THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
 THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
 THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
-
 #ifdef CUDA_HALF_TENSOR
 THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
 #endif
diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h
index 270d16e8..e549f09b 100644
--- a/lib/THC/generic/THCTensorCopy.h
+++ b/lib/THC/generic/THCTensorCopy.h
@@ -32,7 +32,6 @@ THC_API void TH_CONCAT_2(THLongTensor_copyCuda  , Real)  (THCState *state, THLon
 THC_API void TH_CONCAT_2(THFloatTensor_copyCuda , Real)  (THCState *state, THFloatTensor *self, THCTensor *src);
 THC_API void TH_CONCAT_2(THDoubleTensor_copyCuda, Real)  (THCState *state, THDoubleTensor *self, THCTensor *src);
 THC_API void TH_CONCAT_2(THHalfTensor_copyCuda, Real)    (THCState *state, THHalfTensor *self, THCTensor *src);
-
 THC_API void THCTensor_(copyCuda) (THCState *state, THCTensor *self, THCTensor *src);
 
 THC_API void THTensor_(copyCuda) (THCState *state, THTensor *self, THCTensor *src);
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
index 63c9989f..545971f9 100644
--- a/lib/THC/generic/THCTensorMathBlas.cu
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -36,7 +36,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
 
 #else
   THError("unimplemented data type");
-  return ScalarConvert<int, accreal>::to(0);
+  return 0;
 #endif
 }
 
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
index 0b4094b8..b7e82021 100644
--- a/lib/THC/generic/THCTensorMathPairwise.cu
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -63,7 +63,7 @@ THC_API void
 THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
 {
   THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
-  THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
+  THArgCheck(value != THCNumConstants<real>::zero(), 3, "divide by zero");
 
   if (self_ == src_) {
     if (!THC_pointwiseApply1(state, self_, TensorDivConstantOp<real>(value))) {
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
index 755b1a34..b97908a2 100644
--- a/lib/THC/generic/THCTensorMathPointwise.cu
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -263,7 +263,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
       // self += -value * src2
       if (!THC_pointwiseApply2(state, self_, src2,
                                    TensorCAddOp<real>(
-                                     THCNumerics<real>::neg(value)))) {
+                                     ScalarNegate<real>::to(value)))) {
         THArgCheck(false, 2, CUTORCH_DIM_WARNING);
       }
     }
@@ -279,7 +279,7 @@ THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value,
       // self = src1 - value * src2
       if (!THC_pointwiseApply3(state, self_, src1, src2,
                                    TensorCAddOp<real>(
-                                     THCNumerics<real>::neg(value)))) {
+                                     ScalarNegate<real>::to(value)))) {
         THArgCheck(false, 2, CUTORCH_DIM_WARNING);
       }
     }
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
index aad7bceb..ed0e2049 100644
--- a/lib/THC/generic/THCTensorMathReduce.cu
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -301,7 +301,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) {
                      thrust::identity<real>(),
                      ReduceMin<real>(),
                      ReduceMin<real>(),
-                     THCNumConstants<typename THCNumerics<real>::storage_type>::max(), &val, 0)) {
+                     THCNumerics<real>::max(), &val, 0)) {
     THArgCheck(false, 1, CUTORCH_DIM_WARNING);
   }
 
@@ -317,7 +317,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) {
                      thrust::identity<real>(),
                      ReduceMax<real>(),
                      ReduceMax<real>(),
-                     THCNumConstants<typename THCNumerics<real>::storage_type>::min(), &val, 0)) {
+                     THCNumerics<real>::min(), &val, 0)) {
     THArgCheck(false, 1, CUTORCH_DIM_WARNING);
   }
 
@@ -336,7 +336,7 @@ THCTensor_(max)(THCState *state,
   thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
     init =
     thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
-      THCNumConstants<typename THCNumerics<typename TensorUtils<THCTensor>::DataType>::storage_type>::min(), 1);
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
 
   return THC_reduceDimIndex(
     state, values, indices, src, dimension, init,
@@ -354,7 +354,7 @@ THCTensor_(min)(THCState *state,
   thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
     init =
     thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
-      THCNumConstants<typename THCNumerics<typename TensorUtils<THCTensor>::DataType>::storage_type>::max(), 1);
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
 
   return THC_reduceDimIndex(
     state, values, indices, src, dimension, init,

From 18f5ce19c42a7981782a4c6168d4e435ffa9b20e Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Sun, 15 Jan 2017 15:31:38 -0800
Subject: [PATCH 12/13] cleanup

---
 FFI.lua                              | 14 ++---
 Tensor.lua                           |  1 +
 lib/THC/THCHalf.c                    | 91 ----------------------------
 lib/THC/THCNumerics.cuh              | 33 +++++-----
 lib/THC/generic/THCTensorMathBlas.cu |  2 +-
 5 files changed, 20 insertions(+), 121 deletions(-)
 delete mode 100644 lib/THC/THCHalf.c

diff --git a/FFI.lua b/FFI.lua
index fb67c89b..b2777a2b 100644
--- a/FFI.lua
+++ b/FFI.lua
@@ -47,16 +47,10 @@ cudaStream_t THCState_getCurrentStream(THCState *state);
       {'long','Long'},
       {'double','Double'},
   }
-   if cutorch.hasHalf then
-      if cutorch.hasFastHalfInstructions() then
-         -- Enable native half math on Pascal plaforms where fp16 is efficient (GP100)
-         table.insert(CudaTypes, {'half','Half'})
-     else
-        -- on the rest (Maxwell and Pascal 6.1 (1080), resort to storage-only
-        -- (a.k.a 'pseudo') fp16 type (16-bit storage, float math via conversions)
-        table.insert(CudaTypes, {'half','Half', 'float'})
-      end
-   end
+  if cutorch.hasHalf then
+      table.insert(CudaTypes, {'half','Half'})
+  end
+
    for _, typedata in ipairs(CudaTypes) do
       local real, Real = unpack(typedata)
       local ctype_def = [[
diff --git a/Tensor.lua b/Tensor.lua
index a2724d54..0029291b 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -88,3 +88,4 @@ for ValueType, CudaTensorType in pairs(CudaTensorTypes) do
   end
   rawset(torch.getmetatable(CudaTensorType), 'totable', Tensor__totable)
 end
+
diff --git a/lib/THC/THCHalf.c b/lib/THC/THCHalf.c
deleted file mode 100644
index 0f567674..00000000
--- a/lib/THC/THCHalf.c
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "THCHalf.h"
-
-float THC_half2float(half a)
-{
-  unsigned int bits = a.x & 0x7fff;
-  unsigned int sign = a.x & 0x8000;
-  unsigned int exp = a.x & 0x7c00;
-
-  bits <<= 13;
-  sign <<= 16;
-
-  bits += 0x38000000U;
-
-  // flush denormals to 0
-  bits = (exp == 0 ? 0 : bits) | sign;
-
-  union {
-    float f;
-    unsigned int v;
-  } conv;
-  conv.v = bits;
-
-  return conv.f;
-}
-
-/*
-  Copyright (c) 2015, Norbert Juffa
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions
-  are met:
-
-  1. Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-
-  2. Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-half THC_float2half(float a)
-{
-  uint32_t ia;
-  uint16_t ir;
-  memcpy(&ia, &a, sizeof(float));
-
-  ir = (ia >> 16) & 0x8000;
-  if ((ia & 0x7f800000) == 0x7f800000) {
-    if ((ia & 0x7fffffff) == 0x7f800000) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ir = 0x7fff; /* canonical NaN */
-    }
-  } else if ((ia & 0x7f800000) >= 0x33000000) {
-    int shift = (int)((ia >> 23) & 0xff) - 127;
-    if (shift > 15) {
-      ir |= 0x7c00; /* infinity */
-    } else {
-      ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
-      if (shift < -14) { /* denormal */
-        ir |= ia >> (-1 - shift);
-        ia = ia << (32 - (-1 - shift));
-      } else { /* normal */
-        ir |= ia >> (24 - 11);
-        ia = ia << (32 - (24 - 11));
-        ir = ir + ((14 + shift) << 10);
-      }
-      /* IEEE-754 round to nearest of even */
-      if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
-        ir++;
-      }
-    }
-  }
-
-  half ret;
-  memcpy(&ret, &ir, sizeof(half));
-  return ret;
-}
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
index a5d558c1..6b504e77 100644
--- a/lib/THC/THCNumerics.cuh
+++ b/lib/THC/THCNumerics.cuh
@@ -10,8 +10,6 @@ struct THCNumConstants
 {
    static THC_DECL const T one()  { return T(1); }
    static THC_DECL const T zero() { return T(0); }
-   static THC_DECL const T min()  { return std::numeric_limits<T>::min(); }
-   static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
 
 };
 
@@ -20,8 +18,6 @@ struct THCNumConstants<half>
 {
   static THC_DECL const half one()  { half ret = THC_FLOAT_TO_HALF(1.f); return ret;} /* TODO: use literal */
   static THC_DECL const half zero() { half ret; ret.x = 0;  return ret;}
-  static THC_DECL const half min() { half ret; ret.x = 0xFBFF;  return ret; }
-  static THC_DECL const half max() { half ret; ret.x = 0x7BFF;  return ret; }
 };
 
 template <typename T, typename M>
@@ -45,9 +41,6 @@ struct THCNumCommonBase {
     return ScalarConvert<expr_type, storage_type>::to(a);
   }
 
-  static THC_DECL const T min()  { return THCNumConstants<T>::min(); }
-  static THC_DECL const T max()  { return THCNumConstants<T>::max(); }
-
   static  THC_DECL bool lt(const storage_type&  a, const storage_type&  b) { return m_(a) < m_(b);  }
   static  THC_DECL bool le(const storage_type&  a, const storage_type&  b) { return m_(a) <= m_(b); }
   static  THC_DECL bool gt(const storage_type&  a, const storage_type&  b) { return m_(a) > m_(b);  }
@@ -63,7 +56,8 @@ struct THCNumCommonBase {
   static  THC_DECL  expr_type  neg(const storage_type& a) { return  e_(-m_(a)); }
   static  THC_DECL  expr_type  pow (const storage_type& a, T b) { return e_(::pow((double)a, (double)b)); }
   static  THC_DECL  expr_type  mod(const storage_type&  a, const storage_type&  b) { return e_(m_(a) % m_(b)); }
-
+   static THC_DECL const T min()  { return std::numeric_limits<T>::min(); }
+   static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
 };
 
 template <typename T, typename M, bool is_int>
@@ -88,6 +82,10 @@ struct THCNumBase<T, M, false> : public THCNumCommonBase<T, M> {
   using typename Base::expr_type;
   using typename Base::storage_type;
 
+  static THC_DECL const T min()  { return -std::numeric_limits<T>::max(); }
+  static THC_DECL const T max()  { return std::numeric_limits<T>::max(); }
+
+
   static  THC_DECL  expr_type exp  (const storage_type& a) { return  e_(::exp(m_(a))); }
   static  THC_DECL  expr_type log  (const storage_type& a) { return  e_(::log(m_(a))); }
   static  THC_DECL  expr_type log1p(const storage_type& a) { return  e_(::log1p(m_(a))); }
@@ -127,9 +125,13 @@ struct THCNumerics: public THCNumBase<T, T, std::numeric_limits<T>::is_integer>
 
 #ifdef CUDA_HALF_TENSOR
 
-#ifndef CUDA_HALF_INSTRUCTIONS
 template <>
 struct THCNumerics<half>: public THCNumBase<half, float, false>  {
+  static THC_DECL const half min() { half ret; ret.x = 0xFBFF;  return ret; }
+  static THC_DECL const half max() { half ret; ret.x = 0x7BFF;  return ret; }
+  typedef THCNumConstants<half> Constants;
+
+# ifndef CUDA_HALF_INSTRUCTIONS
   typedef THCNumCommonBase<half, float> Base;
   using typename Base::math_type;
   using typename Base::expr_type;
@@ -137,15 +139,8 @@ struct THCNumerics<half>: public THCNumBase<half, float, false>  {
   using Base::e_;
   using Base::m_;
   using Base::s_;
-  typedef THCNumConstants<half> Constants;
-};
-
-#else
-template <>
-struct THCNumerics<half>: public THCNumBase<half, float, false>  {
+# else
   typedef THCNumCommonBase<half, half> Base;
-  typedef THCNumConstants<half> Constants;
-
   typedef typename Base::storage_type storage_type;
   typedef typename Base::math_type    math_type;
   typedef typename Base::expr_type    expr_type;
@@ -232,9 +227,9 @@ struct THCNumerics<half>: public THCNumBase<half, float, false>  {
   static  THC_DECL half mod  (const half& a, const half& b) { 
     return __float2half(fmodf(__half2float(a), __half2float(b)));
   }
-
-};
 # endif
+};
+
 #endif
 
 #endif // THC_NUMERICS_INC
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
index 545971f9..63c9989f 100644
--- a/lib/THC/generic/THCTensorMathBlas.cu
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -36,7 +36,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
 
 #else
   THError("unimplemented data type");
-  return 0;
+  return ScalarConvert<int, accreal>::to(0);
 #endif
 }
 

From b6771be0b8f72f1ff4fba98dd9d88700cb0eeb0d Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <bfomitchev@nvidia.com>
Date: Sun, 15 Jan 2017 16:03:40 -0800
Subject: [PATCH 13/13] Enabling half instructions for selected Pascal archs

---
 lib/THC/THCHalf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index 5e35694e..dbeb4311 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -41,7 +41,7 @@ THC_API int THC_fastHalfInstructions(THCState *state);
 #   define THC_DECL inline
 #  endif
 
-#if 0 // __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620
+#if __CUDA_ARCH__ == 600 || __CUDA_ARCH__ >= 620
 # define CUDA_HALF_INSTRUCTIONS 1
 #endif