From caf5a613c1199bf034f0a3a262387195ffbc2e9c Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 20 May 2016 16:43:10 -0400
Subject: [PATCH 01/71] fix compilation issues under Windows

---
 lib/TH/THMemoryFile.c           |  4 ++--
 lib/TH/THRandom.c               |  2 +-
 lib/TH/generic/THTensorLapack.c | 16 ++++++++--------
 lib/TH/generic/THTensorMath.c   | 27 +++++++++++----------------
 4 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c
index d39b84179..c669edcc4 100644
--- a/lib/TH/THMemoryFile.c
+++ b/lib/TH/THMemoryFile.c
@@ -177,7 +177,7 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable)
       size_t i;                                                           \
       for(i = 0; i < n; i++)                                            \
       {                                                                 \
-        ssize_t nByteWritten;                                           \
+        size_t nByteWritten;                                           \
         while (1)                                                       \
         {                                                               \
           ASCII_WRITE_ELEM;                                             \
@@ -479,7 +479,7 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n)
     size_t i;
     for(i = 0; i < n; i++)
     {
-      ssize_t nByteWritten;
+      size_t nByteWritten;
       while (1)
       {
         nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]);
diff --git a/lib/TH/THRandom.c b/lib/TH/THRandom.c
index 55ee9438c..b1f66a3c3 100644
--- a/lib/TH/THRandom.c
+++ b/lib/TH/THRandom.c
@@ -57,7 +57,7 @@ static unsigned long readURandomLong()
   if (randDev < 0) {
     THError("Unable to open /dev/urandom");
   }
-  ssize_t readBytes = read(randDev, &randValue, sizeof(randValue));
+  size_t readBytes = read(randDev, &randValue, sizeof(randValue));
   if (readBytes < sizeof(randValue)) {
     THError("Unable to read from /dev/urandom");
   }
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index b7ba0f158..62d730a83 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -218,7 +218,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
                            THCleanup(THTensor_(free)(ra__);
                                      THTensor_(free)(rb__);
                                      THTensor_(free)(work);),
-                           "gels", info);
+                           "gels", info,"");
 
   /* rb__ is currently ldb by nrhs; resize it to n by nrhs */
   rb__->size[0] = n;
@@ -283,7 +283,7 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job
                                      THTensor_(free)(wi);
                                      THTensor_(free)(wr);
                                      THTensor_(free)(work);),
-                           "geev", info);
+                           "geev", info,"");
 
   {
     real *re_data = THTensor_(data)(re__);
@@ -340,7 +340,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
                            THCleanup(THTensor_(free)(rv__);
                                      THTensor_(free)(re__);
                                      THTensor_(free)(work);),
-                           "syev", info);
+                           "syev", info,"");
 
   THTensor_(freeCopyTo)(rv__, rv_);
   THTensor_(freeCopyTo)(re__, re_);
@@ -417,7 +417,7 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
                                THTensor_(free)(rv__);
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
-                           "gesvd", info);
+                           "gesvd", info,"");
 
   if (*jobu == 'S')
     THTensor_(narrow)(rv__,NULL,1,0,k);
@@ -666,7 +666,7 @@ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
-                           "pstrf", info);
+                           "pstrf", info,"");
 
   THTensor_(clearUpLoTriangle)(ra__, uplo);
 
@@ -760,7 +760,7 @@ void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a)
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
-                           "geqrf", info);
+                           "geqrf", info,"");
 
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
@@ -813,7 +813,7 @@ void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau)
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
-                           "orgqr", info);
+                           "orgqr", info,"");
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }
@@ -876,7 +876,7 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
-                           "ormqr", info);
+                           "ormqr", info,"");
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 901010356..57ac17e39 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -385,6 +385,13 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
   return sum;
 }
 
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+#define th_isnan(val) \
+if (isnan(value)) break;
+#else
+#define th_isnan(val)
+#endif
+
 real THTensor_(minall)(THTensor *tensor)
 {
   real theMin;
@@ -398,10 +405,7 @@ real THTensor_(minall)(THTensor *tensor)
                   if(!(value >= theMin))
                   {
                     theMin = value;
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-                    if (isnan(value))
-                      break;
-#endif
+                    th_isnan(value)
                   });
   return theMin;
 }
@@ -419,10 +423,7 @@ real THTensor_(maxall)(THTensor *tensor)
                   if(!(value <= theMax))
                   {
                     theMax = value;
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-                    if (isnan(value))
-                      break;
-#endif
+                    th_isnan(value)
                   });
   return theMax;
 }
@@ -1079,10 +1080,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
                          {
                            theIndex = i;
                            theMax = value;
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-                           if (isnan(value))
-                             break;
-#endif
+                           th_isnan(value)
                          }
                        }
                        *indices__data = theIndex;
@@ -1118,10 +1116,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
                          {
                            theIndex = i;
                            theMin = value;
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-                           if (isnan(value))
-                             break;
-#endif
+                           th_isnan(value)
                          }
                        }
                        *indices__data = theIndex;

From 8fe85b56ddeee181affb5ffda75ed563a02a8eeb Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 20 May 2016 16:47:30 -0400
Subject: [PATCH 02/71] Point torch-scm-1.rockspec at the elikosan/torch7 fork

---
 rocks/torch-scm-1.rockspec | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocks/torch-scm-1.rockspec b/rocks/torch-scm-1.rockspec
index 222872649..56bf18064 100644
--- a/rocks/torch-scm-1.rockspec
+++ b/rocks/torch-scm-1.rockspec
@@ -2,14 +2,14 @@ package = "torch"
 version = "scm-1"
 
 source = {
-   url = "git://github.com/torch/torch7.git",
+   url = "git://github.com/elikosan/torch7.git",
 }
 
 description = {
    summary = "Torch7",
    detailed = [[
    ]],
-   homepage = "https://github.com/torch/torch7",
+   homepage = "https://github.com/elikosan/torch7",
    license = "BSD"
 }
 

From c7fbe8f6f9593b4fcb3abc5ee54bd79e02b77139 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 20 May 2016 17:08:00 -0400
Subject: [PATCH 03/71] m.lib not found/necessary under Windows

---
 lib/TH/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 200cdc12b..551ea50f5 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -223,7 +223,9 @@ IF (UNIX AND NOT APPLE)
    ENDIF(NEED_LIBRT)
 ENDIF(UNIX AND NOT APPLE)
 
-TARGET_LINK_LIBRARIES(TH m)
+IF(NOT MSVC)
+  TARGET_LINK_LIBRARIES(TH m)
+ENDIF(NOT MSVC)
 
 SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
 FOREACH(KEYWORD "inline" "__inline__" "__inline")

From 8efe1597207721ef794f1074db303d0a9a8d7b35 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 27 May 2016 16:00:46 -0400
Subject: [PATCH 04/71] remove m.lib from list of required MKL blas libs for
 Windows

---
 lib/TH/cmake/FindMKL.cmake | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 4099fee10..1b55d3d46 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -137,6 +137,13 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   ENDIF(_libraries_work)
 ENDMACRO(CHECK_ALL_LIBRARIES)
 
+if (WIN32)
+  set(mkl_m "")
+else
+  set(mkl_m "m")
+endif
+
+
 # Check for version 10/11
 IF (NOT MKL_LIBRARIES)
   SET(MKL_VERSION 1011)
@@ -147,7 +154,7 @@ FOREACH(mklrtl ${mklrtls} "")
       FOREACH(mklthread ${mklthreads})
         IF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)
           CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;m" "")
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
         ENDIF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)          
       ENDFOREACH(mklthread)
     ENDFOREACH(mkl64)
@@ -158,7 +165,7 @@ FOREACH(mklrtl ${mklrtls} "")
     FOREACH(mkl64 ${mkl64s} "")
       IF (NOT MKL_LIBRARIES)
         CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-          "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;m" "")
+          "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m}" "")
         IF (MKL_LIBRARIES)
           SET(mklseq "_sequential")
         ENDIF (MKL_LIBRARIES)
@@ -172,7 +179,7 @@ FOREACH(mklrtl ${mklrtls} "")
       FOREACH(mklthread ${mklthreads})
         IF (NOT MKL_LIBRARIES)
           CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
-            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;m" "")
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
         ENDIF (NOT MKL_LIBRARIES)          
       ENDFOREACH(mklthread)
     ENDFOREACH(mkl64)

From 4d6ea12a3bafbddc2a9e1eb681fd83e9d8f78c53 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 27 May 2016 16:09:50 -0400
Subject: [PATCH 05/71] FindMKL.cmake typo

---
 lib/TH/cmake/FindMKL.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 1b55d3d46..8dc3cde62 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -137,11 +137,11 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   ENDIF(_libraries_work)
 ENDMACRO(CHECK_ALL_LIBRARIES)
 
-if (WIN32)
+if(WIN32)
   set(mkl_m "")
-else
+else(WIN32)
   set(mkl_m "m")
-endif
+endif(WIN32)
 
 
 # Check for version 10/11

From 62e51bedb47e27331c85fbdd2bdd1b71f6c32fb6 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Tue, 31 May 2016 13:54:36 -0400
Subject: [PATCH 06/71] restore torch in rockspec

---
 rocks/torch-scm-1.rockspec | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocks/torch-scm-1.rockspec b/rocks/torch-scm-1.rockspec
index 56bf18064..222872649 100644
--- a/rocks/torch-scm-1.rockspec
+++ b/rocks/torch-scm-1.rockspec
@@ -2,14 +2,14 @@ package = "torch"
 version = "scm-1"
 
 source = {
-   url = "git://github.com/elikosan/torch7.git",
+   url = "git://github.com/torch/torch7.git",
 }
 
 description = {
    summary = "Torch7",
    detailed = [[
    ]],
-   homepage = "https://github.com/elikosan/torch7",
+   homepage = "https://github.com/torch/torch7",
    license = "BSD"
 }
 

From 76cb36c40bb982d4aca10c141669ea2ced4e317f Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 1 Jun 2016 16:11:21 -0400
Subject: [PATCH 07/71] define ssize_t on Windows

---
 lib/TH/THGeneral.h.in | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index 3b62e4a04..5c19da25f 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -109,6 +109,8 @@ do {                                                                  \
 #define snprintf _snprintf
 #define popen _popen
 #define pclose _pclose
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
 #endif
 
 #endif

From 3bce1c8a13ddd50bac6dc48911d5e4e33a6ad569 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 1 Jun 2016 16:12:20 -0400
Subject: [PATCH 08/71] restore ssize_t in THMemoryFile.c now that it is defined on Windows

---
 lib/TH/THMemoryFile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c
index c669edcc4..d39b84179 100644
--- a/lib/TH/THMemoryFile.c
+++ b/lib/TH/THMemoryFile.c
@@ -177,7 +177,7 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable)
       size_t i;                                                           \
       for(i = 0; i < n; i++)                                            \
       {                                                                 \
-        size_t nByteWritten;                                           \
+        ssize_t nByteWritten;                                           \
         while (1)                                                       \
         {                                                               \
           ASCII_WRITE_ELEM;                                             \
@@ -479,7 +479,7 @@ static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n)
     size_t i;
     for(i = 0; i < n; i++)
     {
-      size_t nByteWritten;
+      ssize_t nByteWritten;
       while (1)
       {
         nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]);

From be729d69b32579dc4c44259b000a02f432930620 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 1 Jun 2016 16:12:56 -0400
Subject: [PATCH 09/71] restore ssize_t in THRandom.c now that it is defined on Windows

---
 lib/TH/THRandom.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/THRandom.c b/lib/TH/THRandom.c
index b1f66a3c3..55ee9438c 100644
--- a/lib/TH/THRandom.c
+++ b/lib/TH/THRandom.c
@@ -57,7 +57,7 @@ static unsigned long readURandomLong()
   if (randDev < 0) {
     THError("Unable to open /dev/urandom");
   }
-  size_t readBytes = read(randDev, &randValue, sizeof(randValue));
+  ssize_t readBytes = read(randDev, &randValue, sizeof(randValue));
   if (readBytes < sizeof(randValue)) {
     THError("Unable to read from /dev/urandom");
   }

From 89ab8e4614f283f3722f6da73e38318f4455d52d Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Tue, 6 Dec 2016 15:26:57 -0500
Subject: [PATCH 10/71] support for MKL ilp64

---
 lib/TH/generic/THBlas.c | 69 +++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 30 deletions(-)

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 759689f02..7dbb88aa1 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -8,6 +8,15 @@
 # define ffloat float
 #endif
 
+#define MKL_IL64
+
+#ifdef MKL_ILP64 
+#define BLAS_INT long  
+#else
+#define BLAS_INT int
+#endif  
+
+
 TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
 TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
 TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx);
@@ -38,9 +47,9 @@ void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dswap_(&i_n, x, &i_incx, y, &i_incy);
@@ -69,8 +78,8 @@ void THBlas_(scal)(long n, real a, real *x, long incx)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dscal_(&i_n, &a, x, &i_incx);
@@ -98,9 +107,9 @@ void THBlas_(copy)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dcopy_(&i_n, x, &i_incx, y, &i_incy);
@@ -128,9 +137,9 @@ void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     daxpy_(&i_n, &a, x, &i_incx, y, &i_incy);
@@ -158,9 +167,9 @@ real THBlas_(dot)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     return (real) ddot_(&i_n, x, &i_incx, y, &i_incy);
@@ -189,11 +198,11 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
       (incx > 0) && (incx <= INT_MAX) &&
       (incy > 0) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
@@ -241,11 +250,11 @@ void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
@@ -300,12 +309,12 @@ void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha,
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_k = (int)k;
-    int i_lda = (int)lda;
-    int i_ldb = (int)ldb;
-    int i_ldc = (int)ldc;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_k = (BLAS_INT)k;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_ldb = (BLAS_INT)ldb;
+    BLAS_INT i_ldc = (BLAS_INT)ldc;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);

From 933ff336a03891d4171a52f307a3aa617e4199f6 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Tue, 6 Dec 2016 15:27:56 -0500
Subject: [PATCH 11/71] support for MKL ilp64

---
 lib/TH/cmake/FindMKL.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 8dc3cde62..7bb8e7ff8 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -41,7 +41,7 @@ CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "em64t")
   SET(iccvers "intel64")
-  SET(mkl64s "_lp64")
+  SET(mkl64s "_ilp64")
 ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "32")
   SET(iccvers "ia32")

From 2118356000eff7c7ce78b0cedbfc7975ff97498d Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 7 Dec 2016 16:54:10 -0500
Subject: [PATCH 12/71] blas decl compat with ilp64 model

---
 lib/TH/generic/THBlas.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 7dbb88aa1..4cd05f94a 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -16,24 +16,22 @@
 #define BLAS_INT int
 #endif  
 
-
-TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx);
-TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx);
-TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
-TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
-TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
-TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
-TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
-TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
-    
+TH_EXTERNC void dswap_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void sswap_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dscal_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx);
+TH_EXTERNC void sscal_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx);
+TH_EXTERNC void dcopy_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void scopy_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void daxpy_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void saxpy_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC double ddot_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC ffloat sdot_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, double *alpha, double *a, BLAS_INT *lda, double *x, BLAS_INT *incx, double *beta, double *y, BLAS_INT *incy);
+TH_EXTERNC void sgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, float *alpha, float *a, BLAS_INT *lda, float *x, BLAS_INT *incx, float *beta, float *y, BLAS_INT *incy);
+TH_EXTERNC void dger_(BLAS_INT *m, BLAS_INT *n, double *alpha, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy, double *a, BLAS_INT *lda);
+TH_EXTERNC void sger_(BLAS_INT *m, BLAS_INT *n, float *alpha, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy, float *a, BLAS_INT *lda);
+TH_EXTERNC void dgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, double *alpha, double *a, BLAS_INT *lda, double *b, BLAS_INT *ldb, double *beta, double *c, BLAS_INT *ldc);
+TH_EXTERNC void sgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, float *alpha, float *a, BLAS_INT *lda, float *b, BLAS_INT *ldb, float *beta, float *c, BLAS_INT *ldc);
  
 
 void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)

From c8951bd52c78e239a914d63f36bfc172eee02f52 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 7 Dec 2016 17:18:01 -0500
Subject: [PATCH 13/71] make ilp64 patch work on Windows

---
 lib/TH/generic/THBlas.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 4cd05f94a..16358d4dd 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -11,10 +11,15 @@
 #define MKL_IL64
 
 #ifdef MKL_ILP64 
-#define BLAS_INT long  
+ #ifdef WIN32
+  #define BLAS_INT __int64 
+ #else
+  #define BLAS_INT long 
+ #endif
 #else
-#define BLAS_INT int
-#endif  
+ #define BLAS_INT int
+#endif
+
 
 TH_EXTERNC void dswap_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
 TH_EXTERNC void sswap_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);

From e4a8557333e6733249b633dbd114a519b85cdc02 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 7 Dec 2016 18:06:44 -0500
Subject: [PATCH 14/71] define MKL_ILP64

---
 lib/TH/generic/THBlas.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 16358d4dd..04d92003c 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -8,7 +8,7 @@
 # define ffloat float
 #endif
 
-#define MKL_IL64
+#define MKL_ILP64
 
 #ifdef MKL_ILP64 
  #ifdef WIN32

From ba78c0fcc57d1a4ec94e4d967eaa58a705bef5f0 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 12 Dec 2016 17:19:29 -0500
Subject: [PATCH 15/71] avoid crashing on Windows with ilp64

---
 lib/TH/cmake/FindBLAS.cmake | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 2188fc724..ded8d5825 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -242,32 +242,44 @@ endif()
 # Determine if blas was compiled with the f2c conventions
 IF (BLAS_LIBRARIES)
   SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+  
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern double sdot_();
 int main() {
-  int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_DOUBLE_WORKS )
+
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern float sdot_();
 int main() {
   int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_FLOAT_WORKS )
+
   IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     MESSAGE(STATUS "This BLAS uses the F2C return conventions")
     SET(BLAS_F2C TRUE)

From 8ed047a566a25398249c6efae5b8c747bb7df2db Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 12 Dec 2016 17:28:27 -0500
Subject: [PATCH 16/71] use MKL_LP64 compiler flag to trigger 32bit ints on
 64bits platforms

---
 lib/TH/generic/THBlas.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 04d92003c..8dd8047c5 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -8,15 +8,16 @@
 # define ffloat float
 #endif
 
-#define MKL_ILP64
-
-#ifdef MKL_ILP64 
+// define MKL_LP64 to get 32bit ints on 64bit platforms
+#ifndef MKL_LP64 
+ // 64bit ints
  #ifdef WIN32
   #define BLAS_INT __int64 
  #else
   #define BLAS_INT long 
  #endif
 #else
+ // 32bit ints
  #define BLAS_INT int
 #endif
 

From 37773731050010d84b1d05ba0bd882d4d42a0b7a Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 3 Mar 2017 17:02:42 -0500
Subject: [PATCH 17/71] uninitialized variable

---
 lib/TH/cmake/FindSSE.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
index f84ce89fb..a14abe8d4 100644
--- a/lib/TH/cmake/FindSSE.cmake
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -73,7 +73,7 @@ SET(AVX2_CODE "
 
   int main()
   {
-    __m256i a;
+    __m256i a = {0};
     a = _mm256_abs_epi16(a);
     return 0;
   }

From 8c7f1889286ab93b2a902dec4d7233f4c1fa8a2a Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Thu, 9 Mar 2017 11:30:54 -0500
Subject: [PATCH 18/71] fix broken link, rephrase ugly sentence.

---
 doc/tensor.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/tensor.md b/doc/tensor.md
index 5809dc186..fa61812e8 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -4,14 +4,14 @@
 The `Tensor` class is probably the most important class in
 `Torch`. Almost every package depends on this class. It is *__the__*
 class for handling numeric data. As with   pretty much anything in
-[Torch7](./../index.md), tensors are
+[Torch7](./index.md), tensors are
 [serializable](file.md#torch.File.serialization).
 
 __Multi-dimensional matrix__
 
-A `Tensor` is a potentially multi-dimensional matrix. The number of
-dimensions is unlimited that can be created using
-[LongStorage](storage.md) with more dimensions.
+A `Tensor` is a multi-dimensional matrix. The number of
+dimensions is unlimited (up to what can be created using
+[LongStorage](storage.md)).
 
 Example:
 ```lua

From 211a32191d16335c839d07c97deb9d854a596d6b Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 14:44:27 -0400
Subject: [PATCH 19/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index a592cfb62..dfd5f629f 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -109,22 +109,29 @@ void THStorage_(retain)(THStorage *storage)
 
 void THStorage_(free)(THStorage *storage)
 {
-  if(!storage)
-    return;
+  printf("THStorage: begin\n");
+  if(!storage) return;
+  printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
 
   if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
   {
+    printf("THStorage: 1-refcnt=%d\n", storage->refcount);
     if(THAtomicDecrementRef(&storage->refcount))
     {
+      printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
       if(storage->flag & TH_STORAGE_FREEMEM) {
+        printf("THStorage: calling free!");
         storage->allocator->free(storage->allocatorContext, storage->data);
+        printf("THStorage: after calling free!");
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_(free)(storage->view);
       }
       THFree(storage);
+      printf("THStorage: after storage free!");
     }
   }
+  printf("THStorage: at end");	
 }
 
 THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size)

From 281699f4f987015e42b57cdf0391127929bc9ee8 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 14:49:46 -0400
Subject: [PATCH 20/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index dfd5f629f..5c16e456a 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -120,18 +120,18 @@ void THStorage_(free)(THStorage *storage)
     {
       printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
       if(storage->flag & TH_STORAGE_FREEMEM) {
-        printf("THStorage: calling free!");
+        printf("THStorage: calling free!\n");
         storage->allocator->free(storage->allocatorContext, storage->data);
-        printf("THStorage: after calling free!");
+        printf("THStorage: after calling free!\n");
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_(free)(storage->view);
       }
       THFree(storage);
-      printf("THStorage: after storage free!");
+      printf("THStorage: after storage free!\n");
     }
   }
-  printf("THStorage: at end");	
+  printf("THStorage: at end\n");	
 }
 
 THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size)

From d255c20cfaa56602557e9025da2302c5b71c54aa Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 16:39:16 -0400
Subject: [PATCH 21/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index 5c16e456a..5836828ae 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -111,14 +111,16 @@ void THStorage_(free)(THStorage *storage)
 {
   printf("THStorage: begin\n");
   if(!storage) return;
-  printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
+  //printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
 
   if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
   {
-    printf("THStorage: 1-refcnt=%d\n", storage->refcount);
+    //printf("THStorage: 1-refcnt=%d\n", storage->refcount);
+    printf("THStorage: 1\n");
     if(THAtomicDecrementRef(&storage->refcount))
     {
-      printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
+      //printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
+      printf("THStorage: 2\n");
       if(storage->flag & TH_STORAGE_FREEMEM) {
         printf("THStorage: calling free!\n");
         storage->allocator->free(storage->allocatorContext, storage->data);

From 3ea659523f30e91581030766310cc59a61fe5685 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 16:59:24 -0400
Subject: [PATCH 22/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index 5836828ae..c1cef4187 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -109,31 +109,31 @@ void THStorage_(retain)(THStorage *storage)
 
 void THStorage_(free)(THStorage *storage)
 {
-  printf("THStorage: begin\n");
+  //printf("THStorage: begin\n");
   if(!storage) return;
   //printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
 
   if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
   {
     //printf("THStorage: 1-refcnt=%d\n", storage->refcount);
-    printf("THStorage: 1\n");
+    //printf("THStorage: 1\n");
     if(THAtomicDecrementRef(&storage->refcount))
     {
       //printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
-      printf("THStorage: 2\n");
+      //printf("THStorage: 2\n");
       if(storage->flag & TH_STORAGE_FREEMEM) {
-        printf("THStorage: calling free!\n");
+        //printf("THStorage: calling free!\n");
         storage->allocator->free(storage->allocatorContext, storage->data);
-        printf("THStorage: after calling free!\n");
+        //printf("THStorage: after calling free!\n");
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_(free)(storage->view);
       }
       THFree(storage);
-      printf("THStorage: after storage free!\n");
+      //printf("THStorage: after storage free!\n");
     }
   }
-  printf("THStorage: at end\n");	
+  //printf("THStorage: at end\n");	
 }
 
 THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size)

From e6999a40d7bb3d054ed09c1d7684d0403e199e2d Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 17:16:51 -0400
Subject: [PATCH 23/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index c1cef4187..bdf112346 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -109,31 +109,31 @@ void THStorage_(retain)(THStorage *storage)
 
 void THStorage_(free)(THStorage *storage)
 {
-  //printf("THStorage: begin\n");
+  printf("THStorage: begin\n");
   if(!storage) return;
-  //printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
+  printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
 
   if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
   {
-    //printf("THStorage: 1-refcnt=%d\n", storage->refcount);
+    printf("THStorage: 1-refcnt=%d\n", storage->refcount);
     //printf("THStorage: 1\n");
     if(THAtomicDecrementRef(&storage->refcount))
     {
-      //printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
+      printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
       //printf("THStorage: 2\n");
       if(storage->flag & TH_STORAGE_FREEMEM) {
-        //printf("THStorage: calling free!\n");
+        printf("THStorage: calling free!\n");
         storage->allocator->free(storage->allocatorContext, storage->data);
-        //printf("THStorage: after calling free!\n");
+        printf("THStorage: after calling free!\n");
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_(free)(storage->view);
       }
       THFree(storage);
-      //printf("THStorage: after storage free!\n");
+      printf("THStorage: after storage free!\n");
     }
   }
-  //printf("THStorage: at end\n");	
+  printf("THStorage: at end\n");	
 }
 
 THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size)

From f896da1c612d28a9a4105d90c1ddc3ac8e61ef0c Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 18:02:08 -0400
Subject: [PATCH 24/71] Update THStorage.c

---
 lib/TH/generic/THStorage.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
index bdf112346..a592cfb62 100644
--- a/lib/TH/generic/THStorage.c
+++ b/lib/TH/generic/THStorage.c
@@ -109,31 +109,22 @@ void THStorage_(retain)(THStorage *storage)
 
 void THStorage_(free)(THStorage *storage)
 {
-  printf("THStorage: begin\n");
-  if(!storage) return;
-  printf("THStorage: flag=%d atomicrefcnt=%d\n", storage->flag, THAtomicGet(&storage->refcount));
+  if(!storage)
+    return;
 
   if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
   {
-    printf("THStorage: 1-refcnt=%d\n", storage->refcount);
-    //printf("THStorage: 1\n");
     if(THAtomicDecrementRef(&storage->refcount))
     {
-      printf("THStorage: 2-refcnt=%d flag=%d\n", storage->refcount, storage->flag);
-      //printf("THStorage: 2\n");
       if(storage->flag & TH_STORAGE_FREEMEM) {
-        printf("THStorage: calling free!\n");
         storage->allocator->free(storage->allocatorContext, storage->data);
-        printf("THStorage: after calling free!\n");
       }
       if(storage->flag & TH_STORAGE_VIEW) {
         THStorage_(free)(storage->view);
       }
       THFree(storage);
-      printf("THStorage: after storage free!\n");
     }
   }
-  printf("THStorage: at end\n");	
 }
 
 THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size)

From a05a3d9cb2166fad46f1f61eb8c618a80cdb0b91 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 18:44:18 -0400
Subject: [PATCH 25/71] missing dllexport for Windows!

---
 lib/TH/vector/AVX.h | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
index bfaeaa6b0..9fbbd8439 100644
--- a/lib/TH/vector/AVX.h
+++ b/lib/TH/vector/AVX.h
@@ -3,21 +3,21 @@
 
 #include <stddef.h>
 
-void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
-void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
-void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
-void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
+TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+TH_API void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+TH_API void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+TH_API void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+TH_API void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
+TH_API void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+TH_API void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+TH_API void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+TH_API void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
 
 #endif

From 1b94d523a0bf7623dd60579a3afa5761c71052e0 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 17 Mar 2017 18:45:25 -0400
Subject: [PATCH 26/71] missing dllexport for Windows!

---
 lib/TH/vector/AVX2.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
index 85a9e93ee..b9d2a3433 100644
--- a/lib/TH/vector/AVX2.h
+++ b/lib/TH/vector/AVX2.h
@@ -3,7 +3,7 @@
 
 #include <stddef.h>
 
-void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
 
 #endif

From 97c39e515abd3df549c80aa155e7abc988fefe64 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 20 Mar 2017 10:35:43 -0400
Subject: [PATCH 27/71] missing include

---
 lib/TH/vector/AVX.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
index 9fbbd8439..355dc2fb6 100644
--- a/lib/TH/vector/AVX.h
+++ b/lib/TH/vector/AVX.h
@@ -2,6 +2,7 @@
 #define TH_AVX_H
 
 #include <stddef.h>
+#include "THGeneral.h"
 
 TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
 TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);

From ca226c459a7051cba1e58b9671ab56bb006d92f3 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 20 Mar 2017 10:36:03 -0400
Subject: [PATCH 28/71] missing include

---
 lib/TH/vector/AVX2.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
index b9d2a3433..724336baf 100644
--- a/lib/TH/vector/AVX2.h
+++ b/lib/TH/vector/AVX2.h
@@ -2,6 +2,7 @@
 #define TH_AVX2_H
 
 #include <stddef.h>
+#include "THGeneral.h"
 
 TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
 TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);

From 42ec69f97e3e92c8e37ae202e1e7f73d1bfa8261 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 20 Mar 2017 11:08:59 -0400
Subject: [PATCH 29/71] Update AVX.c

---
 lib/TH/vector/AVX.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
index b7d5dd1d6..a9faecdaa 100644
--- a/lib/TH/vector/AVX.c
+++ b/lib/TH/vector/AVX.c
@@ -1,6 +1,7 @@
 #if defined(__AVX__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
+#pragma message("----> Compiling with AVX")
 #else
 #include <intrin.h>
 #endif

From db90eaa3ee45f32b4ef7e4cf2ddae59d8d82d198 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 20 Mar 2017 11:20:54 -0400
Subject: [PATCH 30/71] remove ifdef __AVX__

---
 lib/TH/vector/AVX.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
index a9faecdaa..481036053 100644
--- a/lib/TH/vector/AVX.c
+++ b/lib/TH/vector/AVX.c
@@ -1,7 +1,5 @@
-#if defined(__AVX__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
-#pragma message("----> Compiling with AVX")
 #else
 #include <intrin.h>
 #endif
@@ -272,4 +270,3 @@ void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdi
   }
 }
 
-#endif // defined(__AVX__)

From 259b2221b3c8d1e7b9e197d33874fd35c94e87c4 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 20 Mar 2017 11:21:25 -0400
Subject: [PATCH 31/71] remove ifdef __AVX2__

---
 lib/TH/vector/AVX2.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c
index 082a680ea..8affcfd86 100644
--- a/lib/TH/vector/AVX2.c
+++ b/lib/TH/vector/AVX2.c
@@ -1,4 +1,3 @@
-#if defined(__AVX2__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
 #else
@@ -43,5 +42,3 @@ void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const flo
     z[i] = x[i] + y[i] * c;
   }
 }
-
-#endif // defined(__AVX2__)

From 48da138d54738fbe1489939fc3043912f97686a7 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 3 Apr 2017 15:29:03 -0400
Subject: [PATCH 32/71] revert

---
 lib/TH/vector/AVX.h | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h
index 355dc2fb6..bfaeaa6b0 100644
--- a/lib/TH/vector/AVX.h
+++ b/lib/TH/vector/AVX.h
@@ -2,23 +2,22 @@
 #define TH_AVX_H
 
 #include <stddef.h>
-#include "THGeneral.h"
 
-TH_API void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
-TH_API void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
-TH_API void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-TH_API void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-TH_API void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
-TH_API void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-TH_API void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-TH_API void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
-TH_API void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
-TH_API void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
-TH_API void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-TH_API void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-TH_API void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
-TH_API void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
-TH_API void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
-TH_API void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n);
+void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n);
+void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n);
+void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n);
+void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n);
+void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n);
 
 #endif

From 0293729c0f45a56c62a1bea842a7eb4207a561e3 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 3 Apr 2017 17:11:05 -0400
Subject: [PATCH 33/71] revert

---
 lib/TH/vector/AVX.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c
index 481036053..b7d5dd1d6 100644
--- a/lib/TH/vector/AVX.c
+++ b/lib/TH/vector/AVX.c
@@ -1,3 +1,4 @@
+#if defined(__AVX__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
 #else
@@ -270,3 +271,4 @@ void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdi
   }
 }
 
+#endif // defined(__AVX__)

From 3fb5afc33c95e240597e9a2d36ccc2e2d462304f Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 3 Apr 2017 17:16:15 -0400
Subject: [PATCH 34/71] update from master

---
 lib/TH/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 8aeb204c9..9e2256b99 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -25,8 +25,8 @@ ENDIF()
 ######################################################################
 
 IF(MSVC)
-  # MSVC now supports C99 since VS2013/VS2015
-  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
+  # MSVC now supports C99 since VS2013/VS2015, however the standard version switch is not provided yet
+  # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99")
 ELSE(MSVC)
   # enable gnu99 and not c99 because we use
   # gnu extensions like posix_memalign
@@ -212,7 +212,7 @@ ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND)
 IF(C_AVX_FOUND)
   IF(MSVC)
     SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}")
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX ${C_AVX_FLAGS}")
   ELSE(MSVC)
     SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}")
     SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}")
@@ -222,7 +222,7 @@ ENDIF(C_AVX_FOUND)
 
 IF(C_AVX2_FOUND)
   IF(MSVC)
-    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox ${C_AVX2_FLAGS}")
+    SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX2 ${C_AVX2_FLAGS}")
   ELSE(MSVC)
     SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}")
   ENDIF(MSVC)

From d3999bc32c84c1d41c129d14482ee5b873c9bbc6 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 3 Apr 2017 17:35:38 -0400
Subject: [PATCH 35/71] revert

---
 lib/TH/vector/AVX2.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h
index 724336baf..85a9e93ee 100644
--- a/lib/TH/vector/AVX2.h
+++ b/lib/TH/vector/AVX2.h
@@ -2,9 +2,8 @@
 #define TH_AVX2_H
 
 #include <stddef.h>
-#include "THGeneral.h"
 
-TH_API void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
-TH_API void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
+void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n);
+void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n);
 
 #endif

From 2777b048968b8196edaa91444ab2ee1dee66942e Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 3 Apr 2017 17:36:16 -0400
Subject: [PATCH 36/71] revert

---
 lib/TH/vector/AVX2.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c
index 8affcfd86..082a680ea 100644
--- a/lib/TH/vector/AVX2.c
+++ b/lib/TH/vector/AVX2.c
@@ -1,3 +1,4 @@
+#if defined(__AVX2__)
 #ifndef _MSC_VER
 #include <x86intrin.h>
 #else
@@ -42,3 +43,5 @@ void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const flo
     z[i] = x[i] + y[i] * c;
   }
 }
+
+#endif // defined(__AVX2__)

From d92e185ab38431b3a6834eb2e0070a4098f6338f Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 8 May 2017 12:04:04 -0400
Subject: [PATCH 37/71] uninitialized variable

The uninitialized __m256i variable in the AVX2 check code makes the check
program crash under Windows; initialize it before use.
---
 lib/TH/cmake/FindSSE.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
index f84ce89fb..a14abe8d4 100644
--- a/lib/TH/cmake/FindSSE.cmake
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -73,7 +73,7 @@ SET(AVX2_CODE "
 
   int main()
   {
-    __m256i a;
+    __m256i a = {0};
     a = _mm256_abs_epi16(a);
     return 0;
   }

From f5fa7982054f87f6b200c8814f8049611c14139b Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Mon, 8 May 2017 12:14:04 -0400
Subject: [PATCH 38/71] fix dead link + rewrite sentence

---
 doc/tensor.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/tensor.md b/doc/tensor.md
index 5a7df0f73..d18af9977 100644
--- a/doc/tensor.md
+++ b/doc/tensor.md
@@ -4,14 +4,14 @@
 The `Tensor` class is probably the most important class in
 `Torch`. Almost every package depends on this class. It is *__the__*
 class for handling numeric data. As with   pretty much anything in
-[Torch7](./../index.md), tensors are
+[Torch7](./index.md), tensors are
 [serializable](file.md#torch.File.serialization).
 
 __Multi-dimensional matrix__
 
-A `Tensor` is a potentially multi-dimensional matrix. The number of
-dimensions is unlimited that can be created using
-[LongStorage](storage.md) with more dimensions.
+A `Tensor` is a multi-dimensional matrix. The number of
+dimensions is unlimited (up to what can be created using
+[LongStorage](storage.md)).
 
 Example:
 ```lua

From 122621014e989adc4381c78b6c09305ffb73978c Mon Sep 17 00:00:00 2001
From: ethanluoyc <ethanluoyc@gmail.com>
Date: Fri, 21 Apr 2017 00:24:14 +0100
Subject: [PATCH 39/71] Implement lgamma function.

---
 lib/TH/generic/THTensorMath.c | 1 +
 lib/TH/generic/THTensorMath.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index d6c510c4a..e12d2ac54 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2721,6 +2721,7 @@ TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0)
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
 
 LAB_IMPLEMENT_BASIC_FUNCTION(log,log)
+LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,lgamma)
 LAB_IMPLEMENT_BASIC_FUNCTION(log1p,log1p)
 LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_sigmoid)
 LAB_IMPLEMENT_BASIC_FUNCTION(exp,exp)
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index 63b15ae80..86d36e611 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -140,6 +140,7 @@ TH_API void THTensor_(abs)(THTensor *r_, THTensor *t);
 
 TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(log)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(lgamma)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(exp)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(cos)(THTensor *r_, THTensor *t);

From 97a97c2ebfe3aa8868c091af06cb0c579f7e5566 Mon Sep 17 00:00:00 2001
From: Gregory Chanan <gchanan@fb.com>
Date: Wed, 3 May 2017 11:03:58 -0700
Subject: [PATCH 40/71] Add a keepdim parameter for reduction functions over a
 single dimension.

By default, this parameter is False -- a backwards incompatible change, but
one that follows numpy semantics, e.g. numpy.sum (numpy names the parameter
"keepdims" since you can pass multiple dims to reduction functions).

The old behavior seems desired for normalization type operations
where the tensor will immediately be expanded out again, e.g.:
probs.sum(1).expand_as(probs)
which no longer works because the dimension to expand is missing.
This can be fixed by simply passing True as the "keepdim" argument
to the reduction operation, e.g.:
probs.sum(1, keepdim=True).expand_as(probs)
---
 lib/TH/generic/THTensorMath.c | 64 ++++++++++++++++++++++++++++-------
 lib/TH/generic/THTensorMath.h | 22 ++++++------
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index e12d2ac54..74d5cc54b 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -1485,7 +1485,7 @@ ptrdiff_t THTensor_(numel)(THTensor *t)
   return THTensor_(nElement)(t);
 }
 
-void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
 {
   THLongStorage *dim;
 
@@ -1554,9 +1554,14 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
     THTensor_(free)(tempValues_);
     THLongTensor_free(tempIndices_);
   }
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(values_, values_, dimension);
+    THLongTensor_squeeze1d(indices_, indices_, dimension);
+  }
 }
 
-void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
 {
   THLongStorage *dim;
 
@@ -1622,10 +1627,15 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
                             *tempIndices__data = *tempIndices__dimOffset;
                           });
   }
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(values_, values_, dimension);
+    THLongTensor_squeeze1d(indices_, indices_, dimension);
+  }
 }
 
 
-void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim)
 {
   THLongStorage *dim;
 
@@ -1655,9 +1665,13 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
     TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;);
     THTensor_(free)(temp_);
   }
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(r_, r_, dimension);
+  }
 }
 
-void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim)
 {
   THLongStorage *dim;
 
@@ -1687,6 +1701,10 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
     TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;);
     THTensor_(free)(temp_);
   }
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(r_, r_, dimension);
+  }
 }
 
 void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
@@ -2255,7 +2273,7 @@ static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements,
 #undef REAL_SWAP
 #undef BOTH_SWAP
 
-void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
 {
   THLongStorage *dim;
   THTensor *temp_;
@@ -2313,9 +2331,13 @@ void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int
 
   THTensor_(free)(temp_);
   THLongTensor_free(tempi_);
+  if (!keepdim) {
+    THTensor_(squeeze1d)(values_, values_, dimension);
+    THLongTensor_squeeze1d(indices_, indices_, dimension);
+  }
 }
 
-void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension)
+void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim)
 {
   THLongStorage *dim;
   THTensor *temp_;
@@ -2355,9 +2377,13 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t,
 
   THTensor_(free)(temp_);
   THLongTensor_free(tempi_);
+  if (!keepdim) {
+    THTensor_(squeeze1d)(values_, values_, dimension);
+    THLongTensor_squeeze1d(indices_, indices_, dimension);
+  }
 }
 
-void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim)
 {
   long t_size_dim, k;
 
@@ -2366,7 +2392,7 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i
   t_size_dim = THTensor_(size)(t, dimension);
   k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */
 
-  THTensor_(kthvalue)(values_, indices_, t, k+1, dimension);
+  THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim);
 }
 
 void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted)
@@ -2759,16 +2785,16 @@ void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
   TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_lerp(*a_data, *b_data, weight););
 }
 
-void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension)
+void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim)
 {
   THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
       dimension + TH_INDEX_BASE);
 
-  THTensor_(sum)(r_, t, dimension);
+  THTensor_(sum)(r_, t, dimension, keepdim);
   THTensor_(div)(r_, r_, t->size[dimension]);
 }
 
-void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
+void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
 {
   THLongStorage *dim;
 
@@ -2807,9 +2833,13 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
                          sum2 = (sum2 < 0 ? 0 : sum2);
                          *r__data = (real)sqrt(sum2);
                        });
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(r_, r_, dimension);
+  }
 }
 
-void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag)
+void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim)
 {
   THLongStorage *dim;
 
@@ -2848,9 +2878,13 @@ void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag)
                          sum2 = (sum2 < 0 ? 0 : sum2);
                          *r__data = (real)sum2;
                        });
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(r_, r_, dimension);
+  }
 }
 
-void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension)
+void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim)
 {
   THLongStorage *dim;
 
@@ -2877,6 +2911,10 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension)
                            sum += pow(fabs(t_data[i*t_stride]), value);
                          *r__data = pow(sum, 1.0/value);)
   }
+
+  if (!keepdim) {
+    THTensor_(squeeze1d)(r_, r_, dimension);
+  }
 }
 
 accreal THTensor_(normall)(THTensor *tensor, real value)
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index 86d36e611..a3cf4107e 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -69,13 +69,13 @@ TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha,
 TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain);
 
 TH_API ptrdiff_t THTensor_(numel)(THTensor *t);
-TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension);
-TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
-TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension);
-TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim);
+TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim);
 TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension);
 TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension);
 TH_API void THTensor_(sign)(THTensor *r_, THTensor *t);
@@ -165,10 +165,10 @@ TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(frac)(THTensor *r_, THTensor *t);
 TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight);
 
-TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension);
-TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag);
-TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag);
-TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension);
+TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim);
+TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
+TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim);
+TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim);
 TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
 TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
 TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);

From aea5e732f07881d226aa189b3c5610a9eeac026e Mon Sep 17 00:00:00 2001
From: gchanan <gregchanan@gmail.com>
Date: Tue, 9 May 2017 17:44:36 -0400
Subject: [PATCH 41/71] Add keepdim to lua cwrap. (#1025)

---
 TensorMath.lua | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/TensorMath.lua b/TensorMath.lua
index 53838aefe..45e07c63e 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -616,7 +616,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
         cname("sum"),
         {{name=Tensor, default=true, returned=true},
          {name=Tensor},
-         {name="index"}})
+         {name="index"},
+         {name="boolean", default=true, invisible=true}})
 
    wrap("prod",
         cname("prodall"),
@@ -625,7 +626,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
         cname("prod"),
         {{name=Tensor, default=true, returned=true},
          {name=Tensor},
-         {name="index"}})
+         {name="index"},
+         {name="boolean", default=true, invisible=true}})
 
    for _,name in ipairs({"min", "max"}) do
       wrap(name,
@@ -636,7 +638,8 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
            {{name=Tensor, default=true, returned=true},
             {name="IndexTensor", default=true, returned=true, noreadadd=true},
             {name=Tensor},
-            {name="index"}})
+            {name="index"},
+            {name="boolean", default=true, invisible=true}})
    end
 
    for _,name in ipairs({"cmin", "cmax"}) do
@@ -719,21 +722,24 @@ wrap("topk",
          {name="IndexTensor", default=true, returned=true, noreadadd=true},
          {name=Tensor},
          {name="long"},
-         {name="index", default=lastdim(3)}})
+         {name="index", default=lastdim(3)},
+         {name="boolean", default=true, invisible=true}})
 
    wrap("mode",
        cname("mode"),
        {{name=Tensor, default=true, returned=true},
            {name="IndexTensor", default=true, returned=true, noreadadd=true},
            {name=Tensor},
-           {name="index", default=lastdim(3)}})
+           {name="index", default=lastdim(3)},
+           {name="boolean", default=true, invisible=true}})
 
    wrap("median",
         cname("median"),
         {{name=Tensor, default=true, returned=true},
          {name="IndexTensor", default=true, returned=true, noreadadd=true},
          {name=Tensor},
-         {name="index", default=lastdim(3)}})
+         {name="index", default=lastdim(3)},
+         {name="boolean", default=true, invisible=true}})
 
    wrap("tril",
         cname("tril"),
@@ -1083,7 +1089,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
            cname("mean"),
            {{name=Tensor, default=true, returned=true},
             {name=Tensor},
-            {name="index"}})
+            {name="index"},
+            {name="boolean", default=true, invisible=true}})
 
       for _,name in ipairs({"var", "std"}) do
          wrap(name,
@@ -1094,7 +1101,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
               {{name=Tensor, default=true, returned=true},
                {name=Tensor},
                {name="index"},
-               {name="boolean", default=false}})
+               {name="boolean", default=false},
+               {name="boolean", default=true, invisible=true}})
       end
       wrap("histc",
            cname("histc"),
@@ -1121,7 +1129,8 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
            {{name=Tensor, default=true, returned=true},
             {name=Tensor},
             {name=real},
-            {name="index"}})
+            {name="index"},
+            {name="boolean", default=true, invisible=true}})
 
       wrap("renorm",
            cname("renorm"),

From cba53b2bd877317f25aeaa2c5a1bd223824861b1 Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili <contact@pavanky.com>
Date: Thu, 11 May 2017 10:39:35 -0700
Subject: [PATCH 42/71] Ensuring float tensors call float versions of math
 functions

---
 lib/TH/THMath.h               | 17 ++++++-
 lib/TH/generic/THTensorMath.c | 89 ++++++++++++++++++++---------------
 2 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/lib/TH/THMath.h b/lib/TH/THMath.h
index b96083f9a..004e4fe45 100644
--- a/lib/TH/THMath.h
+++ b/lib/TH/THMath.h
@@ -17,5 +17,20 @@ static inline double TH_lerp(double a, double b, double weight) {
   return a + weight * (b-a);
 }
 
-#endif // _THMATH_H
+static inline float TH_sigmoidf(float value) {
+  return 1.0f / (1.0f + expf(-value));
+}
+
+static inline float TH_fracf(float x) {
+  return x - truncf(x);
+}
+
+static inline float TH_rsqrtf(float x) {
+  return 1.0f / sqrtf(x);
+}
 
+static inline float TH_lerpf(float a, float b, float weight) {
+  return a + weight * (b-a);
+}
+
+#endif // _THMATH_H
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 74d5cc54b..1dc1bc7f6 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2746,43 +2746,50 @@ TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0)
 /* floating point only now */
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
 
-LAB_IMPLEMENT_BASIC_FUNCTION(log,log)
-LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,lgamma)
-LAB_IMPLEMENT_BASIC_FUNCTION(log1p,log1p)
-LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_sigmoid)
-LAB_IMPLEMENT_BASIC_FUNCTION(exp,exp)
-LAB_IMPLEMENT_BASIC_FUNCTION(cos,cos)
-LAB_IMPLEMENT_BASIC_FUNCTION(acos,acos)
-LAB_IMPLEMENT_BASIC_FUNCTION(cosh,cosh)
-LAB_IMPLEMENT_BASIC_FUNCTION(sin,sin)
-LAB_IMPLEMENT_BASIC_FUNCTION(asin,asin)
-LAB_IMPLEMENT_BASIC_FUNCTION(sinh,sinh)
-LAB_IMPLEMENT_BASIC_FUNCTION(tan,tan)
-LAB_IMPLEMENT_BASIC_FUNCTION(atan,atan)
-LAB_IMPLEMENT_BASIC_FUNCTION(tanh,tanh)
-LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,pow)
-LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,sqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_rsqrt)
-LAB_IMPLEMENT_BASIC_FUNCTION(ceil,ceil)
-LAB_IMPLEMENT_BASIC_FUNCTION(floor,floor)
-LAB_IMPLEMENT_BASIC_FUNCTION(round,round)
-LAB_IMPLEMENT_BASIC_FUNCTION(abs,fabs)
-LAB_IMPLEMENT_BASIC_FUNCTION(trunc,trunc)
-LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_frac)
+#if defined (TH_REAL_IS_FLOAT)
+#define TH_MATH_NAME(fn) fn##f
+#else
+#define TH_MATH_NAME(fn) fn
+#endif
+
+LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log))
+LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma))
+LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p))
+LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid))
+LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp))
+LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos))
+LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos))
+LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh))
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin))
+LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin))
+LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh))
+LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan))
+LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan))
+LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh))
+LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,TH_MATH_NAME(pow))
+LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt))
+LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil))
+LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor))
+LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round))
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs))
+LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc))
+LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac))
 LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
-LAB_IMPLEMENT_BASIC_FUNCTION(cinv, 1.0 / )
+LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / )
+
 
 void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty)
 {
   THTensor_(resizeAs)(r_, tx);
-  TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = atan2(*tx_data,*ty_data););
+  TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data););
 }
 
 void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
 {
   THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match");
   THTensor_(resizeAs)(r_, a);
-  TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_lerp(*a_data, *b_data, weight););
+  TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight););
 }
 
 void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim)
@@ -2823,7 +2830,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
                          sum2 /= t_size;
                          sum2 -= sum*sum;
                          sum2 = (sum2 < 0 ? 0 : sum2);
-                         *r__data = (real)sqrt(sum2);
+                         *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
                        }
                        else
                        {
@@ -2831,7 +2838,7 @@ void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keep
                          sum2 /= t_size-1;
                          sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum;
                          sum2 = (sum2 < 0 ? 0 : sum2);
-                         *r__data = (real)sqrt(sum2);
+                         *r__data = (real)TH_MATH_NAME(sqrt)(sum2);
                        });
 
   if (!keepdim) {
@@ -2907,9 +2914,11 @@ void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int k
     TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
                          accreal sum = 0;
                          long i;
-                         for(i = 0; i < t_size; i++)
-                           sum += pow(fabs(t_data[i*t_stride]), value);
-                         *r__data = pow(sum, 1.0/value);)
+                         for(i = 0; i < t_size; i++) {
+                           sum += TH_MATH_NAME(pow)(
+                             TH_MATH_NAME(fabs)(t_data[i*t_stride]), value);
+                         }
+                         *r__data = TH_MATH_NAME(pow)(sum, 1.0/value);)
   }
 
   if (!keepdim) {
@@ -2924,14 +2933,14 @@ accreal THTensor_(normall)(THTensor *tensor, real value)
     TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;);
     return sum;
   } else if(value == 1) {
-    TH_TENSOR_APPLY(real, tensor, sum += fabs(*tensor_data););
+    TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data););
     return sum;
   } else if(value == 2) {
     TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;);
     return sqrt(sum);
   } else {
-    TH_TENSOR_APPLY(real, tensor, sum += pow(fabs(*tensor_data), value););
-    return pow(sum, 1.0/value);
+    TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value););
+    return TH_MATH_NAME(pow)(sum, 1.0/value);
   }
 }
 
@@ -2963,7 +2972,7 @@ void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension,
     } else if (value == 2) {
       TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;);
     } else {
-      TH_TENSOR_APPLY(real, rowS, norm += pow(fabs(*rowS_data), value););
+      TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value););
     }
 
     norm = pow(norm, 1/value);
@@ -2989,8 +2998,9 @@ accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
 {
   real sum = 0;
   TH_TENSOR_APPLY2(real, tensor, real, src,
-  sum += pow(fabs(*tensor_data - *src_data), value);)
-  return pow(sum, 1.0/value);
+                   sum += TH_MATH_NAME(pow)(
+                     TH_MATH_NAME(fabs)(*tensor_data - *src_data), value););
+  return TH_MATH_NAME(pow)(sum, 1.0/value);
 }
 
 accreal THTensor_(meanall)(THTensor *tensor)
@@ -3048,12 +3058,12 @@ void THTensor_(logspace)(THTensor *r_, real a, real b, long n)
 
   if(n == 1) {
     TH_TENSOR_APPLY(real, r_,
-        *r__data = pow(10.0, a);
+        *r__data = TH_MATH_NAME(pow)(10.0, a);
         i++;
         );
   } else {
     TH_TENSOR_APPLY(real, r_,
-        *r__data = pow(10.0, a + i*(b-a)/((real)(n-1)));
+        *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1)));
         i++;
         );
   }
@@ -3141,6 +3151,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minval
   );
 }
 
+#undef TH_MATH_NAME
 #endif /* floating point only part */
 #undef IS_NONZERO
 #endif

From 0da8686d1e918c2410c3219f1efdaa38ac23717e Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 12:49:40 -0400
Subject: [PATCH 43/71] mkl 64 now installs in intel64

---
 lib/TH/cmake/FindMKL.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 7c9325a75..0ffe23f00 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -39,7 +39,7 @@ SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL
 # Checks
 CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)
-  SET(mklvers "em64t")
+  SET(mklvers "intel64")
   SET(iccvers "intel64")
   SET(mkl64s "_lp64")
 ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)

From 0b95e88b3a08b01ee848f15b4211a0ce29306c13 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 13:58:17 -0400
Subject: [PATCH 44/71] add debug message

---
 lib/TH/cmake/FindMKL.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 0ffe23f00..34f58ef27 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -89,6 +89,8 @@ IF (INTEL_MKL_DIR)
   ENDIF (MSVC)
 ENDIF (INTEL_MKL_DIR)
 
+MESSAGE(STATUS "Searching for MKL in ${CMAKE_LIBRARY_PATH} ...")
+
 # Try linking multiple libs
 MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
   # This macro checks for the existence of the combination of libraries given by _list.

From 7d3ae236de0d09d5f8293a7bfdc4719e8683dacb Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 14:07:58 -0400
Subject: [PATCH 45/71] more debugging messages

---
 lib/TH/cmake/FindMKL.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 34f58ef27..1b772836f 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -28,6 +28,8 @@ SET(MKL_CDFT_LIBRARIES)
 INCLUDE(CheckTypeSize)
 INCLUDE(CheckFunctionExists)
 
+MESSAGE(STATUS "INTEL_MKL_DIR: ${INTEL_MKL_DIR}")
+
 # Intel Compiler Suite
 SET(INTEL_COMPILER_DIR CACHE STRING
   "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
@@ -36,6 +38,8 @@ SET(INTEL_MKL_DIR CACHE STRING
 SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL
   "Force using the sequential (non threaded) libraries")
 
+MESSAGE(STATUS "INTEL_MKL_DIR: ${INTEL_MKL_DIR}")
+
 # Checks
 CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)

From dda382eba128dc6312748f6f9dce29a02a829547 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 15:55:32 -0400
Subject: [PATCH 46/71] add default value to Intel MKL dirs from environment
 variables

---
 lib/TH/cmake/FindMKL.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 1b772836f..db3199162 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -31,9 +31,9 @@ INCLUDE(CheckFunctionExists)
 MESSAGE(STATUS "INTEL_MKL_DIR: ${INTEL_MKL_DIR}")
 
 # Intel Compiler Suite
-SET(INTEL_COMPILER_DIR CACHE STRING
+SET(INTEL_COMPILER_DIR $ENV{INTEL_COMPILER_DIR} CACHE STRING
   "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
-SET(INTEL_MKL_DIR CACHE STRING
+SET(INTEL_MKL_DIR $ENV{INTEL_MKL_DIR} CACHE STRING
   "Root directory of the Intel MKL (standalone)")
 SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL
   "Force using the sequential (non threaded) libraries")

From 49cdd2893044bc9e690ac177e26e64bece3cba5a Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 16:54:15 -0400
Subject: [PATCH 47/71] tell cmake find_library about prefix and suffix

---
 lib/TH/cmake/FindMKL.cmake | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index db3199162..e99f66638 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -28,8 +28,6 @@ SET(MKL_CDFT_LIBRARIES)
 INCLUDE(CheckTypeSize)
 INCLUDE(CheckFunctionExists)
 
-MESSAGE(STATUS "INTEL_MKL_DIR: ${INTEL_MKL_DIR}")
-
 # Intel Compiler Suite
 SET(INTEL_COMPILER_DIR $ENV{INTEL_COMPILER_DIR} CACHE STRING
   "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
@@ -93,6 +91,15 @@ IF (INTEL_MKL_DIR)
   ENDIF (MSVC)
 ENDIF (INTEL_MKL_DIR)
 
+# lib prefix
+IF (MSVC)
+  SET(CMAKE_FIND_LIBRARY_PREFIXES "")
+  SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib" ".dll")
+ELSE(MSVC)
+  SET(CMAKE_FIND_LIBRARY_PREFIXES "lib")
+  SET(CMAKE_FIND_LIBRARY_SUFFIXES ".so" ".a")
+ENDIF (MSVC)
+
 MESSAGE(STATUS "Searching for MKL in ${CMAKE_LIBRARY_PATH} ...")
 
 # Try linking multiple libs

From 345f39d82d3b1b39b975a8c65d45a2ebf1e70be9 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Fri, 19 May 2017 17:34:13 -0400
Subject: [PATCH 48/71] add trailing /

---
 lib/TH/cmake/FindMKL.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index e99f66638..2e961b86a 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -82,12 +82,12 @@ ENDIF (INTEL_COMPILER_DIR)
 IF (INTEL_MKL_DIR)
   # TODO: diagnostic if dir does not exist
   SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}
-    "${INTEL_MKL_DIR}/include")
+    "${INTEL_MKL_DIR}/include/")
   SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-    "${INTEL_MKL_DIR}/lib/${mklvers}")
+    "${INTEL_MKL_DIR}/lib/${mklvers}/")
   IF (MSVC)
     SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_MKL_DIR}/lib/${iccvers}")
+      "${INTEL_MKL_DIR}/lib/${iccvers}/")
   ENDIF (MSVC)
 ENDIF (INTEL_MKL_DIR)
 

From b778edbdf07d630e830e11153e0b6dbd2df99ff3 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 17 May 2017 17:55:45 -0400
Subject: [PATCH 49/71] fix 32/64 bits int issues with Windows

---
 lib/TH/cmake/FindBLAS.cmake |  22 +++++--
 lib/TH/cmake/FindMKL.cmake  |   2 +-
 lib/TH/generic/THBlas.c     | 114 +++++++++++++++++++-----------------
 3 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 2188fc724..ded8d5825 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -242,32 +242,44 @@ endif()
 # Determine if blas was compiled with the f2c conventions
 IF (BLAS_LIBRARIES)
   SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+  
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern double sdot_();
 int main() {
-  int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_DOUBLE_WORKS )
+
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-int four = 4;
-int one = 1;
+#ifdef WIN32
+  typedef __int64 BLINT;
+#else
+  typedef long BLINT;
+#endif
+BLINT four = 4;
+BLINT one = 1;
 extern float sdot_();
 int main() {
   int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_FLOAT_WORKS )
+
   IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     MESSAGE(STATUS "This BLAS uses the F2C return conventions")
     SET(BLAS_F2C TRUE)
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 2e961b86a..5d2e3bea0 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -43,7 +43,7 @@ CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "intel64")
   SET(iccvers "intel64")
-  SET(mkl64s "_lp64")
+  SET(mkl64s "_ilp64")
 ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "32")
   SET(iccvers "ia32")
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index b04931f34..195e65526 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -9,24 +9,37 @@
 # define ffloat float
 #endif
 
-TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx);
-TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx);
-TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy);
-TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy);
-TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
-TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
-TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
-TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
-TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
-TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
+// define MKL_LP64 to get 32bit ints on 64bit platforms
+#ifndef MKL_LP64 
+ // 64bit ints
+ #ifdef WIN32
+  #define BLAS_INT __int64 
+ #else
+  #define BLAS_INT long 
+ #endif
+#else
+ // 32bit ints
+ #define BLAS_INT int
+#endif
 
 
+TH_EXTERNC void dswap_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void sswap_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dscal_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx);
+TH_EXTERNC void sscal_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx);
+TH_EXTERNC void dcopy_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void scopy_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void daxpy_(BLAS_INT *n, double *a, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC void saxpy_(BLAS_INT *n, float *a, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC double ddot_(BLAS_INT *n, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy);
+TH_EXTERNC ffloat sdot_(BLAS_INT *n, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy);
+TH_EXTERNC void dgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, double *alpha, double *a, BLAS_INT *lda, double *x, BLAS_INT *incx, double *beta, double *y, BLAS_INT *incy);
+TH_EXTERNC void sgemv_(char *trans, BLAS_INT *m, BLAS_INT *n, float *alpha, float *a, BLAS_INT *lda, float *x, BLAS_INT *incx, float *beta, float *y, BLAS_INT *incy);
+TH_EXTERNC void dger_(BLAS_INT *m, BLAS_INT *n, double *alpha, double *x, BLAS_INT *incx, double *y, BLAS_INT *incy, double *a, BLAS_INT *lda);
+TH_EXTERNC void sger_(BLAS_INT *m, BLAS_INT *n, float *alpha, float *x, BLAS_INT *incx, float *y, BLAS_INT *incy, float *a, BLAS_INT *lda);
+TH_EXTERNC void dgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, double *alpha, double *a, BLAS_INT *lda, double *b, BLAS_INT *ldb, double *beta, double *c, BLAS_INT *ldc);
+TH_EXTERNC void sgemm_(char *transa, char *transb, BLAS_INT *m, BLAS_INT *n, BLAS_INT *k, float *alpha, float *a, BLAS_INT *lda, float *b, BLAS_INT *ldb, float *beta, float *c, BLAS_INT *ldc);
+ 
 
 void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
 {
@@ -39,9 +52,9 @@ void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dswap_(&i_n, x, &i_incx, y, &i_incy);
@@ -70,8 +83,8 @@ void THBlas_(scal)(long n, real a, real *x, long incx)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dscal_(&i_n, &a, x, &i_incx);
@@ -83,13 +96,8 @@ void THBlas_(scal)(long n, real a, real *x, long incx)
 #endif
   {
     long i;
-    for(i = 0; i < n; i++) {
-      if (a == 0) {
-        x[i*incx] = 0;
-      } else {
-        x[i*incx] *= a;
-      }
-    }
+    for(i = 0; i < n; i++)
+      x[i*incx] *= a;
   }
 }
 
@@ -104,9 +112,9 @@ void THBlas_(copy)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dcopy_(&i_n, x, &i_incx, y, &i_incy);
@@ -134,9 +142,9 @@ void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     daxpy_(&i_n, &a, x, &i_incx, y, &i_incy);
@@ -164,9 +172,9 @@ real THBlas_(dot)(long n, real *x, long incx, real *y, long incy)
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_n = (int)n;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     return (real) ddot_(&i_n, x, &i_incx, y, &i_incy);
@@ -195,11 +203,11 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
       (incx > 0) && (incx <= INT_MAX) &&
       (incy > 0) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
@@ -250,11 +258,11 @@ void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_lda = (int)lda;
-    int i_incx = (int)incx;
-    int i_incy = (int)incy;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_incx = (BLAS_INT)incx;
+    BLAS_INT i_incy = (BLAS_INT)incy;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
@@ -309,12 +317,12 @@ void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha,
 #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
   if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
   {
-    int i_m = (int)m;
-    int i_n = (int)n;
-    int i_k = (int)k;
-    int i_lda = (int)lda;
-    int i_ldb = (int)ldb;
-    int i_ldc = (int)ldc;
+    BLAS_INT i_m = (BLAS_INT)m;
+    BLAS_INT i_n = (BLAS_INT)n;
+    BLAS_INT i_k = (BLAS_INT)k;
+    BLAS_INT i_lda = (BLAS_INT)lda;
+    BLAS_INT i_ldb = (BLAS_INT)ldb;
+    BLAS_INT i_ldc = (BLAS_INT)ldc;
 
 #if defined(TH_REAL_IS_DOUBLE)
     dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);

From 0da38078b9a5e0d64e18cdb0bf9e69a0e92c181d Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 17 May 2017 19:17:32 -0400
Subject: [PATCH 50/71] allow 32 or 64 bit MKL to be linked

add MKL_ILP64 flag in cmake and pass it to C to switch the integer type
(BLAS_INT) used in BLAS declarations
---
 lib/TH/THGeneral.h.in   |  1 +
 lib/TH/generic/THBlas.c | 16 +++++++---------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
index de11f1b19..b86bad2e6 100644
--- a/lib/TH/THGeneral.h.in
+++ b/lib/TH/THGeneral.h.in
@@ -14,6 +14,7 @@
 #cmakedefine USE_BLAS
 #cmakedefine USE_LAPACK
 #cmakedefine BLAS_F2C
+#cmakedefine MKL_ILP64
 
 #ifdef __cplusplus
 # define TH_EXTERNC extern "C"
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 195e65526..c9c6a73f2 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -9,17 +9,15 @@
 # define ffloat float
 #endif
 
-// define MKL_LP64 to get 32bit ints on 64bit platforms
-#ifndef MKL_LP64 
- // 64bit ints
- #ifdef WIN32
-  #define BLAS_INT __int64 
- #else
-  #define BLAS_INT long 
+// set BLAS interface according to MKL 32/64 bit
+#ifdef MKL_ILP64
+  #ifdef WIN32 
+    #define BLAS_INT __int64 
+  #else
+    #define BLAS_INT long 
  #endif
 #else
- // 32bit ints
- #define BLAS_INT int
+  #define BLAS_INT int
 #endif
 
 

From 0ee11e3d031dbb19fc22f9bcb801edcdc3f1040d Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 17 May 2017 19:18:10 -0400
Subject: [PATCH 51/71] add MKL_ILP64 flag in cmake

---
 lib/TH/cmake/FindMKL.cmake | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 5d2e3bea0..9d738077f 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -271,9 +271,23 @@ ENDIF (MKL_LIBRARIES)
 IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
   MESSAGE(FATAL_ERROR "MKL library not found. Please specify library  location")
 ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
+
+# set flag for 32/64 bit MKL
+IF(MKL_FOUND)
+	IF (mkl64s)
+		SET(MKL_ILP64 TRUE)
+	ELSE(mkl64)
+		SET(MKL_ILP64 FALSE)
+	ENDIF(mkl64)
+ENDIF (MKL_FOUND)
+
 IF(NOT MKL_FIND_QUIETLY)
   IF(MKL_FOUND)
-    MESSAGE(STATUS "MKL library found")
+	IF (mkl64s)
+	  MESSAGE(STATUS "MKL 64bit library found")
+	ELSE(mkl64)
+	  MESSAGE(STATUS "MKL 32bit library found")
+	ENDIF(mkl64)
   ELSE(MKL_FOUND)
     MESSAGE(STATUS "MKL library not found")
   ENDIF(MKL_FOUND)

From befd78dccd642a007b4196bbdeca5a37ed70ad5c Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 14:10:19 -0400
Subject: [PATCH 52/71] allow link to MKL_ILP64 model

add LAPACK_INT to allow ILP64 model
fix compiler warning about return in THTensorMath.c
---
 lib/TH/THLapack.h               |  11 ++
 lib/TH/cmake/FindMKL.cmake      |  14 +-
 lib/TH/generic/THBlas.c         |   6 +-
 lib/TH/generic/THLapack.c       |  97 +++++++------
 lib/TH/generic/THLapack.h       |  32 ++---
 lib/TH/generic/THTensorLapack.c | 236 +++++++++++++++++++++-----------
 lib/TH/generic/THTensorMath.c   |  24 ++--
 7 files changed, 251 insertions(+), 169 deletions(-)

diff --git a/lib/TH/THLapack.h b/lib/TH/THLapack.h
index 614d15f94..7f7155421 100644
--- a/lib/TH/THLapack.h
+++ b/lib/TH/THLapack.h
@@ -21,6 +21,17 @@ if (info < 0) {                                                     \
   THError(fmt, func, info, ##__VA_ARGS__);                          \
 }
 
+#ifdef MKL_ILP64
+// set  64 bit MKL integer type
+#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
+#define LAPACK_INT __int64 
+#else
+#define LAPACK_INT long long int
+#endif
+#else
+#define LAPACK_INT int
+#endif
+
 #include "generic/THLapack.h"
 #include "THGenerateAllTypes.h"
 
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 9d738077f..98b485948 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -43,7 +43,11 @@ CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
 IF ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "intel64")
   SET(iccvers "intel64")
-  SET(mkl64s "_ilp64")
+  IF (MKL_ILP64)
+	SET(mkl64s "_ilp64")
+  ELSE(MKL_ILP64)
+	SET(mkl64s "_lp64")
+  ENDIF(MKL_ILP64)
 ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
   SET(mklvers "32")
   SET(iccvers "ia32")
@@ -272,14 +276,6 @@ IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
   MESSAGE(FATAL_ERROR "MKL library not found. Please specify library  location")
 ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
 
-# set flag for 32/64 bit MKL
-IF(MKL_FOUND)
-	IF (mkl64s)
-		SET(MKL_ILP64 TRUE)
-	ELSE(mkl64)
-		SET(MKL_ILP64 FALSE)
-	ENDIF(mkl64)
-ENDIF (MKL_FOUND)
 
 IF(NOT MKL_FIND_QUIETLY)
   IF(MKL_FOUND)
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index c9c6a73f2..1f58060b5 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -9,12 +9,12 @@
 # define ffloat float
 #endif
 
-// set BLAS interface according to MKL 32/64 bit
 #ifdef MKL_ILP64
-  #ifdef WIN32 
+  // set  64 bit MKL integer type
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
     #define BLAS_INT __int64 
   #else
-    #define BLAS_INT long 
+    #define BLAS_INT long long int
  #endif
 #else
   #define BLAS_INT int
diff --git a/lib/TH/generic/THLapack.c b/lib/TH/generic/THLapack.c
index 148ae26c4..910c19ca5 100644
--- a/lib/TH/generic/THLapack.c
+++ b/lib/TH/generic/THLapack.c
@@ -2,43 +2,42 @@
 #define TH_GENERIC_FILE "generic/THLapack.c"
 #else
 
-
-TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
-TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
-TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
-TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
-TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info);
-TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info);
-TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
-TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
-TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info);
-TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info);
-TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info);
-TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info);
-TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
-TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
-TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
-TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
-TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);
-TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info);
-TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);
-TH_EXTERNC void spotrf_(char *uplo, int *n, float *a, int *lda, int *info);
-TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info);
-TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info);
-TH_EXTERNC void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
-TH_EXTERNC void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
-TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
-TH_EXTERNC void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
-TH_EXTERNC void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
-TH_EXTERNC void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
-TH_EXTERNC void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info);
-TH_EXTERNC void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info);
-TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *rank, float *tol, float *work, int *info);
-TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info);
+TH_EXTERNC void dgesv_(LAPACK_INT *n, LAPACK_INT *nrhs, double *a, LAPACK_INT *lda, LAPACK_INT *ipiv, double *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void sgesv_(LAPACK_INT *n, LAPACK_INT *nrhs, float *a, LAPACK_INT *lda, LAPACK_INT *ipiv, float *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, LAPACK_INT *n, LAPACK_INT *nrhs, double *a, LAPACK_INT *lda, double *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, LAPACK_INT *n, LAPACK_INT *nrhs, float *a, LAPACK_INT *lda, float *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void dgels_(char *trans, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *nrhs, double *a, LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sgels_(char *trans, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *nrhs, float *a, LAPACK_INT *lda, float *b, LAPACK_INT *ldb, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dsyev_(char *jobz, char *uplo, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *w, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void ssyev_(char *jobz, char *uplo, LAPACK_INT *n, float *a, LAPACK_INT *lda, float *w, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *wr, double *wi, double* vl, LAPACK_INT *ldvl, double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, LAPACK_INT *n, float *a, LAPACK_INT *lda, float *wr, float *wi, float* vl, LAPACK_INT *ldvl, float *vr, LAPACK_INT *ldvr, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, LAPACK_INT *m, LAPACK_INT *n, float *a, LAPACK_INT *lda, float *s, float *u, LAPACK_INT *ldu, float *vt, LAPACK_INT *ldvt, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dgetrf_(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, LAPACK_INT *ipiv, LAPACK_INT *info);
+TH_EXTERNC void sgetrf_(LAPACK_INT *m, LAPACK_INT *n, float *a, LAPACK_INT *lda, LAPACK_INT *ipiv, LAPACK_INT *info);
+TH_EXTERNC void dgetrs_(char *trans, LAPACK_INT *n, LAPACK_INT *nrhs, double *a, LAPACK_INT *lda, LAPACK_INT *ipiv, double *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void sgetrs_(char *trans, LAPACK_INT *n, LAPACK_INT *nrhs, float *a, LAPACK_INT *lda, LAPACK_INT *ipiv, float *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void dgetri_(LAPACK_INT *n, double *a, LAPACK_INT *lda, LAPACK_INT *ipiv, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sgetri_(LAPACK_INT *n, float *a, LAPACK_INT *lda, LAPACK_INT *ipiv, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dpotrf_(char *uplo, LAPACK_INT *n, double *a, LAPACK_INT *lda, LAPACK_INT *info);
+TH_EXTERNC void spotrf_(char *uplo, LAPACK_INT *n, float *a, LAPACK_INT *lda, LAPACK_INT *info);
+TH_EXTERNC void dpotri_(char *uplo, LAPACK_INT *n, double *a, LAPACK_INT *lda, LAPACK_INT *info);
+TH_EXTERNC void spotri_(char *uplo, LAPACK_INT *n, float *a, LAPACK_INT *lda, LAPACK_INT *info);
+TH_EXTERNC void dpotrs_(char *uplo, LAPACK_INT *n, LAPACK_INT *nrhs, double *a, LAPACK_INT *lda, double *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void spotrs_(char *uplo, LAPACK_INT *n, LAPACK_INT *nrhs, float *a, LAPACK_INT *lda, float *b, LAPACK_INT *ldb, LAPACK_INT *info);
+TH_EXTERNC void sgeqrf_(LAPACK_INT *m, LAPACK_INT *n, float *a, LAPACK_INT *lda, float *tau, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dgeqrf_(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sorgqr_(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, float *a, LAPACK_INT *lda, float *tau, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dorgqr_(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void sormqr_(char *side, char *trans, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, float *a, LAPACK_INT *lda, float *tau, float *c, LAPACK_INT *ldc, float *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void dormqr_(char *side, char *trans, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, LAPACK_INT *lda, double *tau, double *c, LAPACK_INT *ldc, double *work, LAPACK_INT *lwork, LAPACK_INT *info);
+TH_EXTERNC void spstrf_(char *uplo, LAPACK_INT *n, float *a, LAPACK_INT *lda, LAPACK_INT *piv, LAPACK_INT *rank, float *tol, float *work, LAPACK_INT *info);
+TH_EXTERNC void dpstrf_(char *uplo, LAPACK_INT *n, double *a, LAPACK_INT *lda, LAPACK_INT *piv, LAPACK_INT *rank, double *tol, double *work, LAPACK_INT *info);
 
 
 /* Compute the solution to a real system of linear equations  A * X = B */
-void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info)
+void THLapack_(gesv)(LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *b, LAPACK_INT ldb, LAPACK_INT* info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -53,7 +52,7 @@ void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int
 }
 
 /* Solve a triangular system of the form A * X = B  or A^T * X = B */
-void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info)
+void THLapack_(trtrs)(char uplo, char trans, char diag, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, LAPACK_INT* info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -69,7 +68,7 @@ void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a
 
 /* Solve overdetermined or underdetermined real linear systems involving an
 M-by-N matrix A, or its transpose, using a QR or LQ factorization of A */
-void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info)
+void THLapack_(gels)(char trans, LAPACK_INT m, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -84,7 +83,7 @@ void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real
 
 /* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric
 matrix A */
-void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info)
+void THLapack_(syev)(char jobz, char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, real *w, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -99,7 +98,7 @@ void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, rea
 
 /* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and,
 optionally, the left and/or right eigenvectors */
-void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info)
+void THLapack_(geev)(char jobvl, char jobvr, LAPACK_INT n, real *a, LAPACK_INT lda, real *wr, real *wi, real* vl, LAPACK_INT ldvl, real *vr, LAPACK_INT ldvr, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -114,7 +113,7 @@ void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr,
 
 /* Compute the singular value decomposition (SVD) of a real M-by-N matrix A,
 optionally computing the left and/or right singular vectors */
-void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info)
+void THLapack_(gesvd)(char jobu, char jobvt, LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, real *s, real *u, LAPACK_INT ldu, real *vt, LAPACK_INT ldvt, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -128,7 +127,7 @@ void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, rea
 }
 
 /* LU decomposition */
-void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info)
+void THLapack_(getrf)(LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -141,7 +140,7 @@ void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info)
 #endif
 }
 
-void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info)
+void THLapack_(getrs)(char trans, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *b, LAPACK_INT ldb, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -155,7 +154,7 @@ void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv,
 }
 
 /* Matrix Inverse */
-void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info)
+void THLapack_(getri)(LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *work, LAPACK_INT lwork, LAPACK_INT* info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -169,7 +168,7 @@ void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork,
 }
 
 /* Cholesky factorization */
-void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info)
+void THLapack_(potrf)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -183,7 +182,7 @@ void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info)
 }
 
 /* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */
-void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info)
+void THLapack_(potrs)(char uplo, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -197,7 +196,7 @@ void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int
 }
 
 /* Cholesky factorization based Matrix Inverse */
-void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info)
+void THLapack_(potri)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -211,7 +210,7 @@ void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info)
 }
 
 /* Cholesky factorization with complete pivoting */
-void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info)
+void THLapack_(pstrf)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *piv, LAPACK_INT *rank, real tol, real *work, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -225,7 +224,7 @@ void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, r
 }
 
 /* QR decomposition */
-void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info)
+void THLapack_(geqrf)(LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, real *tau, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -239,7 +238,7 @@ void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int
 }
 
 /* Build Q from output of geqrf */
-void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info)
+void THLapack_(orgqr)(LAPACK_INT m, LAPACK_INT n, LAPACK_INT k, real *a, LAPACK_INT lda, real *tau, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
@@ -253,7 +252,7 @@ void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *wo
 }
 
 /* Multiply Q with a matrix using the output of geqrf */
-void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info)
+void THLapack_(ormqr)(char side, char trans, LAPACK_INT m, LAPACK_INT n, LAPACK_INT k, real *a, LAPACK_INT lda, real *tau, real *c, LAPACK_INT ldc, real *work, LAPACK_INT lwork, LAPACK_INT *info)
 {
 #ifdef  USE_LAPACK
 #if defined(TH_REAL_IS_DOUBLE)
diff --git a/lib/TH/generic/THLapack.h b/lib/TH/generic/THLapack.h
index b464dd2d2..1aafd6581 100644
--- a/lib/TH/generic/THLapack.h
+++ b/lib/TH/generic/THLapack.h
@@ -3,38 +3,38 @@
 #else
 
 /* AX=B */
-TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info);
+TH_API void THLapack_(gesv)(LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *b, LAPACK_INT ldb, LAPACK_INT* info);
 /* Solve a triangular system of the form A * X = B  or A^T * X = B */
-TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info);
+TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, LAPACK_INT* info);
 /* ||AX-B|| */
-TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info);
+TH_API void THLapack_(gels)(char trans, LAPACK_INT m, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* Eigenvals */
-TH_API void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info);
+TH_API void THLapack_(syev)(char jobz, char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, real *w, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* Non-sym eigenvals */
-TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info);
+TH_API void THLapack_(geev)(char jobvl, char jobvr, LAPACK_INT n, real *a, LAPACK_INT lda, real *wr, real *wi, real* vl, LAPACK_INT ldvl, real *vr, LAPACK_INT ldvr, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* svd */
-TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info);
+TH_API void THLapack_(gesvd)(char jobu, char jobvt, LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, real *s, real *u, LAPACK_INT ldu, real *vt, LAPACK_INT ldvt, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* LU decomposition */
-TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info);
-TH_API void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info);
+TH_API void THLapack_(getrf)(LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, LAPACK_INT *info);
+TH_API void THLapack_(getrs)(char trans, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *b, LAPACK_INT ldb, LAPACK_INT *info);
 /* Matrix Inverse */
-TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info);
+TH_API void THLapack_(getri)(LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *ipiv, real *work, LAPACK_INT lwork, LAPACK_INT* info);
 
 /* Positive Definite matrices */
 /* Cholesky factorization */
-void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info);
+void THLapack_(potrf)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *info);
 /* Matrix inverse based on Cholesky factorization */
-void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info);
+void THLapack_(potri)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *info);
 /* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */
-void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info);
+void THLapack_(potrs)(char uplo, LAPACK_INT n, LAPACK_INT nrhs, real *a, LAPACK_INT lda, real *b, LAPACK_INT ldb, LAPACK_INT *info);
 /* Cholesky factorization with complete pivoting. */
-void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info);
+void THLapack_(pstrf)(char uplo, LAPACK_INT n, real *a, LAPACK_INT lda, LAPACK_INT *piv, LAPACK_INT *rank, real tol, real *work, LAPACK_INT *info);
 
 /* QR decomposition */
-void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info);
+void THLapack_(geqrf)(LAPACK_INT m, LAPACK_INT n, real *a, LAPACK_INT lda, real *tau, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* Build Q from output of geqrf */
-void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info);
+void THLapack_(orgqr)(LAPACK_INT m, LAPACK_INT n, LAPACK_INT k, real *a, LAPACK_INT lda, real *tau, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 /* Multiply Q with a matrix from output of geqrf */
-void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info);
+void THLapack_(ormqr)(char side, char trans, LAPACK_INT m, LAPACK_INT n, LAPACK_INT k, real *a, LAPACK_INT lda, real *tau, real *c, LAPACK_INT ldc, real *work, LAPACK_INT lwork, LAPACK_INT *info);
 
 #endif
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
index d0196c98d..17c9e5c6c 100644
--- a/lib/TH/generic/THTensorLapack.c
+++ b/lib/TH/generic/THTensorLapack.c
@@ -121,35 +121,35 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
     free_b = 1;
   }
 
-  int n, nrhs, lda, ldb, info;
-  THIntTensor *ipiv;
+  LAPACK_INT n, nrhs, lda, ldb, info;
+  LAPACK_INT *ipiv;
   THTensor *ra__;  // working version of A matrix to be passed into lapack GELS
   THTensor *rb__;  // working version of B matrix to be passed into lapack GELS
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);
 
-  n    = (int)ra__->size[0];
-  nrhs = (int)rb__->size[1];
+  n    = (LAPACK_INT)ra__->size[0];
+  nrhs = (LAPACK_INT)rb__->size[1];
   lda  = n;
   ldb  = n;
 
-  ipiv = THIntTensor_newWithSize1d((long)n);
+  ipiv = (LAPACK_INT*)THAlloc(n * sizeof(LAPACK_INT));
   THLapack_(gesv)(n, nrhs,
-		  THTensor_(data)(ra__), lda, THIntTensor_data(ipiv),
+		  THTensor_(data)(ra__), lda, ipiv,
 		  THTensor_(data)(rb__), ldb, &info);
 
   THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(rb__);
-                               THIntTensor_free(ipiv);
+                               THFree(ipiv);
                                if (free_b) THTensor_(free)(b);),
                            "gesv", info, info);
 
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
-  THIntTensor_free(ipiv);
+  THFree(ipiv);
   if (free_b) THTensor_(free)(b);
 }
 
@@ -174,15 +174,15 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
     free_b = 1;
   }
 
-  int n, nrhs, lda, ldb, info;
+  LAPACK_INT n, nrhs, lda, ldb, info;
   THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
   THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);
 
-  n    = (int)ra__->size[0];
-  nrhs = (int)rb__->size[1];
+  n    = (LAPACK_INT)ra__->size[0];
+  nrhs = (LAPACK_INT)rb__->size[1];
   lda  = n;
   ldb  = n;
 
@@ -222,7 +222,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
     free_b = 1;
   }
 
-  int m, n, nrhs, lda, ldb, info, lwork;
+  LAPACK_INT m, n, nrhs, lda, ldb, info, lwork;
   THTensor *work = NULL;
   real wkopt = 0;
 
@@ -231,8 +231,8 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  m = ra__->size[0];
-  n = ra__->size[1];
+  m = (LAPACK_INT)ra__->size[0];
+  n = (LAPACK_INT)ra__->size[1];
   lda = m;
   ldb = (m > n) ? m : n;
 
@@ -272,7 +272,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 
 void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
 {
-  int n, lda, lwork, info, ldvr;
+  LAPACK_INT n, lda, lwork, info, ldvr;
   THTensor *work, *wi, *wr, *a;
   real wkopt;
   real *rv_data;
@@ -287,7 +287,7 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job
   /* we want to definitely clone a_ for geev*/
   a = THTensor_(cloneColumnMajor)(NULL, a_);
 
-  n = a->size[0];
+  n = (LAPACK_INT)a->size[0];
   lda = n;
 
   wi = THTensor_(newWithSize1d)(n);
@@ -310,7 +310,7 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job
   THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
       NULL, 1, rv_data, ldvr, &wkopt, -1, &info);
 
-  lwork = (int)wkopt;
+  lwork = (LAPACK_INT)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
 
   THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
@@ -354,7 +354,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1,"A should be square");
 
-  int n, lda, lwork, info;
+  LAPACK_INT n, lda, lwork, info;
   THTensor *work;
   real wkopt;
 
@@ -363,7 +363,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
 
   rv__ = THTensor_(cloneColumnMajor)(rv_, a);
 
-  n = rv__->size[0];
+  n = (LAPACK_INT)rv__->size[0];
   lda = n;
 
   THTensor_(resize1d)(re_,n);
@@ -372,7 +372,7 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz
   /* get optimal workspace size */
   THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
 		  THTensor_(data)(re_), &wkopt, -1, &info);
-  lwork = (int)wkopt;
+  lwork = (LAPACK_INT)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
 		  THTensor_(data)(re_), THTensor_(data)(work), lwork, &info);
@@ -400,7 +400,7 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
 
-  int k,m, n, lda, ldu, ldvt, lwork, info;
+  LAPACK_INT k,m, n, lda, ldu, ldvt, lwork, info;
   THTensor *work;
   THTensor *rvf_ = THTensor_(new)();
   real wkopt;
@@ -412,8 +412,8 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  m = ra__->size[0];
-  n = ra__->size[1];
+  m = (LAPACK_INT)ra__->size[0];
+  n = (LAPACK_INT)ra__->size[1];
   k = (m < n ? m : n);
 
   lda = m;
@@ -441,7 +441,7 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra
 		   ldu,
 		   THTensor_(data)(rv__), ldvt,
 		   &wkopt, -1, &info);
-  lwork = (int)wkopt;
+  lwork = (LAPACK_INT)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(gesvd)(jobu[0],jobu[0],
 		   m,n,THTensor_(data)(ra__),lda,
@@ -483,42 +483,42 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a)
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
 
-  int m, n, lda, info, lwork;
+  LAPACK_INT m, n, lda, info, lwork;
   real wkopt;
-  THIntTensor *ipiv;
+  LAPACK_INT *ipiv;
   THTensor *work;
   THTensor *ra__ = NULL;
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  m = ra__->size[0];
-  n = ra__->size[1];
+  m = (LAPACK_INT)ra__->size[0];
+  n = (LAPACK_INT)ra__->size[1];
   lda = m;
-  ipiv = THIntTensor_newWithSize1d((long)m);
+  ipiv = (LAPACK_INT*) THAlloc(m * sizeof(LAPACK_INT));
 
   /* Run LU */
-  THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info);
+  THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, ipiv, &info);
   THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
                            THCleanup(
                                THTensor_(free)(ra__);
-                               THIntTensor_free(ipiv);),
+                               THFree(ipiv);),
                            "getrf", info, info);
 
   /* Run inverse */
-  THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info);
-  lwork = (int)wkopt;
+  THLapack_(getri)(n, THTensor_(data)(ra__), lda, ipiv, &wkopt, -1, &info);
+  lwork = (LAPACK_INT)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
-  THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info);
+  THLapack_(getri)(n, THTensor_(data)(ra__), lda, ipiv, THTensor_(data)(work), lwork, &info);
   THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);
-                               THIntTensor_free(ipiv);),
+                               THFree(ipiv);),
                            "getri", info, info);
 
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
-  THIntTensor_free(ipiv);
+  THFree(ipiv);
 }
 
 void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo)
@@ -593,12 +593,12 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
 
-  int n, lda, info;
+  LAPACK_INT n, lda, info;
   THTensor *ra__ = NULL;
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  n = ra__->size[0];
+  n = (LAPACK_INT)ra__->size[0];
   lda = n;
 
   /* Run Factorization */
@@ -631,15 +631,15 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
     free_b = 1;
   }
 
-  int n, nrhs, lda, ldb, info;
+  LAPACK_INT n, nrhs, lda, ldb, info;
   THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
   THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS
 
   ra__ = THTensor_(cloneColumnMajor)(NULL, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);
 
-  n    = (int)ra__->size[0];
-  nrhs = (int)rb__->size[1];
+  n    = (LAPACK_INT)ra__->size[0];
+  nrhs = (LAPACK_INT)rb__->size[1];
   lda  = n;
   ldb  = n;
 
@@ -665,12 +665,12 @@ void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo)
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
 
-  int n, lda, info;
+  LAPACK_INT n, lda, info;
   THTensor *ra__ = NULL;
 
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  n = ra__->size[0];
+  n = (LAPACK_INT)ra__->size[0];
   lda = n;
 
   /* Run inverse */
@@ -703,32 +703,58 @@ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
 
-  int n = a->size[0];
+  LAPACK_INT n = a->size[0];
 
   THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   THIntTensor_resize1d(rpiv_, n);
 
+  LAPACK_INT *t_rp;
+  if (sizeof(LAPACK_INT) == sizeof(int))
+    t_rp = (LAPACK_INT*)THIntTensor_data(rpiv_);
+  else
+    t_rp = (LAPACK_INT*)THAlloc(n * sizeof(LAPACK_INT));
+
   // Allocate working tensor
   THTensor *work = THTensor_(newWithSize1d)(2 * n);
 
   // Run Cholesky factorization
-  int lda = n;
-  int rank, info;
+  LAPACK_INT lda = n;
+  LAPACK_INT rank, info;
 
   THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda,
-                   THIntTensor_data(rpiv_), &rank, tol,
+                   t_rp, &rank, tol,
                    THTensor_(data)(work), &info);
 
-  THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
-                           THCleanup(
-                               THTensor_(free)(ra__);
-                               THTensor_(free)(work);),
-                           "pstrf", info,"");
+  if (sizeof(LAPACK_INT) == sizeof(int))
+  {
+    THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
+                             THCleanup(
+                                 THTensor_(free)(ra__);
+                                 THTensor_(free)(work);),
+                             "pstrf", info,"");
+
+  }
+  else
+  {
+    THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
+                             THCleanup(
+                                 THFree(t_rp);
+                                 THTensor_(free)(ra__);
+                                 THTensor_(free)(work);),
+                             "pstrf", info,"");
+
+    // copy back to int tensor
+    int *pdst = THIntTensor_data(rpiv_);
+    LAPACK_INT *psrc = t_rp;
+    for (int i = 0; i < n; i++) *pdst++ = (int)*psrc++;
+    THFree(t_rp);
+  }
 
   THTensor_(clearUpLoTriangle)(ra__, uplo);
 
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
+
 }
 
 /*
@@ -793,21 +819,21 @@ void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a)
   /* Prepare the input for LAPACK, making a copy if necessary. */
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  int m = ra__->size[0];
-  int n = ra__->size[1];
+  LAPACK_INT m = (LAPACK_INT)ra__->size[0];
+  LAPACK_INT n = (LAPACK_INT)ra__->size[1];
   int k = (m < n ? m : n);
   int lda = m;
   THTensor_(resize1d)(rtau_, k);
 
   /* Dry-run to query the suggested size of the workspace. */
-  int info = 0;
+  LAPACK_INT info = 0;
   real wkopt = 0;
   THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
                    THTensor_(data)(rtau_),
                    &wkopt, -1, &info);
 
   /* Allocate the workspace and call LAPACK to do the real work. */
-  int lwork = (int)wkopt;
+  LAPACK_INT lwork = (LAPACK_INT)wkopt;  /* keep LAPACK_INT width: an int would truncate large ILP64 workspace sizes */
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
                    THTensor_(data)(rtau_),
@@ -847,20 +873,20 @@ void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau)
   THTensor *ra__ = NULL;
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
 
-  int m = ra__->size[0];
-  int n = ra__->size[1];
-  int k = tau->size[0];
+  LAPACK_INT m = (LAPACK_INT)ra__->size[0];
+  LAPACK_INT n = (LAPACK_INT)ra__->size[1];
+  LAPACK_INT k = (LAPACK_INT)tau->size[0];
   int lda = m;
 
   /* Dry-run to query the suggested size of the workspace. */
-  int info = 0;
+  LAPACK_INT info = 0;
   real wkopt = 0;
   THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
                    THTensor_(data)(tau),
                    &wkopt, -1, &info);
 
   /* Allocate the workspace and call LAPACK to do the real work. */
-  int lwork = (int)wkopt;
+  LAPACK_INT lwork = (LAPACK_INT)wkopt;  /* keep LAPACK_INT width: an int would truncate large ILP64 workspace sizes */
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
                    THTensor_(data)(tau),
@@ -901,10 +927,10 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co
   THTensor *ra__ = NULL;
   ra__ = THTensor_(cloneColumnMajor)(ra_, c);
 
-  int m = c->size[0];
-  int n = c->size[1];
-  int k = tau->size[0];
-  int lda;
+  LAPACK_INT m = (LAPACK_INT)c->size[0];
+  LAPACK_INT n = (LAPACK_INT)c->size[1];
+  LAPACK_INT k = (LAPACK_INT)tau->size[0];
+  LAPACK_INT lda;
   if (*side == 'L')
   {
     lda = m;
@@ -913,17 +939,17 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co
   {
     lda = n;
   }
-  int ldc = m;
+  LAPACK_INT ldc = m;
 
   /* Dry-run to query the suggested size of the workspace. */
-  int info = 0;
+  LAPACK_INT info = 0;
   real wkopt = 0;
   THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
                    THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
                    &wkopt, -1, &info);
 
   /* Allocate the workspace and call LAPACK to do the real work. */
-  int lwork = (int)wkopt;
+  LAPACK_INT lwork = (LAPACK_INT)wkopt;  /* keep LAPACK_INT width: an int would truncate large ILP64 workspace sizes */
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
                    THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
@@ -947,14 +973,14 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf
     THTensor_(copy)(ra_, a);
   }
 
-  int m = a->size[1];
-  int n = a->size[2];
+  LAPACK_INT m = (LAPACK_INT)a->size[1];
+  LAPACK_INT n = (LAPACK_INT)a->size[2];
   if (m != n) {
     THError("btrifact is only implemented for square matrices");
   }
   long num_batches = THTensor_(size)(a, 0);
   THTensor *ra__;
-  int lda;
+  LAPACK_INT lda;
 
   if (ra_->stride[1] == 1) {
     // column ordered, what BLAS wants
@@ -973,11 +999,21 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf
   THTensor *rai = THTensor_(new)();
   THIntTensor *rpivoti = THIntTensor_new();
 
-  int info = 0;
-  int *info_ptr = &info;
+  LAPACK_INT *t_rp;
+  if (sizeof(LAPACK_INT) != sizeof(int))
+    t_rp = (LAPACK_INT*)THAlloc(n * sizeof(LAPACK_INT));
+
+  LAPACK_INT info = 0;
+  LAPACK_INT *info_ptr = &info, *t_inf;
   if (rinfo_) {
     THIntTensor_resize1d(rinfo_, num_batches);
-    info_ptr = THIntTensor_data(rinfo_);
+    if (sizeof(LAPACK_INT) != sizeof(int))
+    {
+      t_inf = (LAPACK_INT*)THAlloc(num_batches * sizeof(LAPACK_INT));
+      info_ptr = t_inf;
+    }
+    else
+      info_ptr = (LAPACK_INT*)THIntTensor_data(rinfo_);
   }
 
   THIntTensor_resize2d(rpivots_, num_batches, n);
@@ -988,8 +1024,19 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf
     THTensor_(select)(rai, ra__, 0, batch);
     THIntTensor_select(rpivoti, rpivots_, 0, batch);
 
+    if (sizeof(LAPACK_INT) == sizeof(int))
+      t_rp = (LAPACK_INT*)THIntTensor_data(rpivoti);
+
     THLapack_(getrf)(n, n, THTensor_(data)(rai), lda,
-                     THIntTensor_data(rpivoti), info_ptr);
+                     t_rp, info_ptr);
+
+    if (sizeof(LAPACK_INT) != sizeof(int))
+    {
+      int *pdst = THIntTensor_data(rpivoti);
+      LAPACK_INT *psrc = t_rp;
+      for (int i = 0; i < n; i++) *pdst++ = (int)*psrc++;
+    }
+
     if (rinfo_) {
       info_ptr++;
     } else if (info != 0) {
@@ -1001,6 +1048,17 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf
   THTensor_(free)(rai);
   THIntTensor_free(rpivoti);
 
+  if (sizeof(LAPACK_INT) != sizeof(int))
+  {
+    if (rinfo_) {
+      int *pdst = THIntTensor_data(rinfo_);
+      LAPACK_INT *psrc = t_inf;
+      for (long i = 0; i < num_batches; i++) *pdst++ = (int)*psrc++;  /* t_inf and rinfo_ hold one entry per batch, not per column */
+      THFree(t_inf);
+    }
+    THFree(t_rp);
+  }
+
   if (ra__ != ra_) {
     THTensor_(freeCopyTo)(ra__, ra_);
   }
@@ -1029,10 +1087,10 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor
   }
 
   long num_batches = atf->size[0];
-  long n = atf->size[1];
-  int nrhs = rb_->nDimension > 2 ? rb_->size[2] : 1;
+  LAPACK_INT n = (LAPACK_INT)atf->size[1];
+  LAPACK_INT nrhs = (LAPACK_INT)(rb_->nDimension > 2 ? rb_->size[2] : 1);
 
-  int lda, ldb;
+  LAPACK_INT lda, ldb;
   THTensor *atf_;
   THTensor *rb__;
 
@@ -1084,19 +1142,34 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor
       THError("Error: rpivots_ is not contiguous.");
   }
 
+  LAPACK_INT *t_rp;
+  if (sizeof(LAPACK_INT) != sizeof(int))
+    t_rp = (LAPACK_INT*)THAlloc(n * sizeof(LAPACK_INT));
+
   for (long batch = 0; batch < num_batches; ++batch) {
     THTensor_(select)(ai, atf_, 0, batch);
     THTensor_(select)(rbi, rb__, 0, batch);
     THIntTensor_select(pivoti, pivots, 0, batch);
 
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
-    int info;
+    /* getrs only reads the pivots, so widen them into t_rp before the call (nothing to copy back). */
+    if (sizeof(LAPACK_INT) == sizeof(int))
+      t_rp = (LAPACK_INT*)THIntTensor_data(pivoti);
+    else
+    {
+      int *psrc = THIntTensor_data(pivoti);
+      LAPACK_INT *pdst = t_rp;
+      for (LAPACK_INT i = 0; i < n; i++) *pdst++ = (LAPACK_INT)*psrc++;
+    }
+
+    LAPACK_INT info;
     THLapack_(getrs)('N', n, nrhs, THTensor_(data)(ai), lda,
-                     THIntTensor_data(pivoti), THTensor_(data)(rbi),
+                     t_rp, THTensor_(data)(rbi),
                      ldb, &info);
     if (info != 0) {
       THError("Error: Nonzero info.");
     }
+
 #else
     THError("Unimplemented");
 #endif
@@ -1106,6 +1179,9 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor
   THTensor_(free)(rbi);
   THIntTensor_free(pivoti);
 
+  if (sizeof(LAPACK_INT) != sizeof(int))
+    THFree(t_rp);
+
   if (atf_ != atf) {
     THTensor_(free)(atf_);
   }
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 1dc1bc7f6..69ee4a8e4 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -609,11 +609,11 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value)
 void THTensor_(lshift)(THTensor *r_, THTensor *t, real value)
 {
 #if defined(TH_REAL_IS_FLOAT)
-  return THTensor_(mul)(r_, t, powf(2, value));
+   THTensor_(mul)(r_, t, powf(2, value));
 #elif defined(TH_REAL_IS_DOUBLE)
-  return THTensor_(mul)(r_, t, pow(2, value));
+   THTensor_(mul)(r_, t, pow(2, value));
 #elif defined(TH_REAL_IS_HALF)
-  return THError("lshift is not supported for torch.HalfTensor");
+   THError("lshift is not supported for torch.HalfTensor");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -644,11 +644,11 @@ void THTensor_(lshift)(THTensor *r_, THTensor *t, real value)
 void THTensor_(rshift)(THTensor *r_, THTensor *t, real value)
 {
 #if defined(TH_REAL_IS_FLOAT)
-  return THTensor_(div)(r_, t, powf(2, value));
+   THTensor_(div)(r_, t, powf(2, value));
 #elif defined(TH_REAL_IS_DOUBLE)
-  return THTensor_(div)(r_, t, pow(2, value));
+   THTensor_(div)(r_, t, pow(2, value));
 #elif defined(TH_REAL_IS_HALF)
-  return THError("rshift is not supported for torch.HalfTensor");
+   THError("rshift is not supported for torch.HalfTensor");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -735,7 +735,7 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
 void THTensor_(bitand)(THTensor *r_, THTensor *t, real value)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("bitand is only supported for integer type tensors");
+   THError("bitand is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -758,7 +758,7 @@ void THTensor_(bitand)(THTensor *r_, THTensor *t, real value)
 void THTensor_(bitor)(THTensor *r_, THTensor *t, real value)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("bitor is only supported for integer type tensors");
+   THError("bitor is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -781,7 +781,7 @@ void THTensor_(bitor)(THTensor *r_, THTensor *t, real value)
 void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("bitxor is only supported for integer type tensors");
+   THError("bitxor is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -1016,7 +1016,7 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
 void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("cbitand is only supported for integer type tensors");
+   THError("cbitand is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -1041,7 +1041,7 @@ void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src)
 void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("cbitor is only supported for integer type tensors");
+   THError("cbitor is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&
@@ -1066,7 +1066,7 @@ void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src)
 void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src)
 {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
-  return THError("cbitxor is only supported for integer type tensors");
+   THError("cbitxor is only supported for integer type tensors");
 #else
   THTensor_(resizeAs)(r_, t);
   if (THTensor_(isContiguous)(r_) &&

From 2985240d70a29d397b4bbc8d2a1f9cc672efdca9 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 14:14:12 -0400
Subject: [PATCH 53/71] pass MKL_ILP64 environment variable to cmake

---
 lib/TH/cmake/FindMKL.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 98b485948..0a4dfb83f 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -33,6 +33,8 @@ SET(INTEL_COMPILER_DIR $ENV{INTEL_COMPILER_DIR} CACHE STRING
   "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
 SET(INTEL_MKL_DIR $ENV{INTEL_MKL_DIR} CACHE STRING
   "Root directory of the Intel MKL (standalone)")
+SET(MKL_ILP64 $ENV{MKL_ILP64} CACHE STRING
+  "Link with 64bit-interger version of MKL (_ilp64 instead of _lp64)")
 SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL
   "Force using the sequential (non threaded) libraries")
 

From 4bce3f433770a161e1ca995fe09c0dd2547598b2 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 14:39:38 -0400
Subject: [PATCH 54/71] findmkl .cmake typo

---
 lib/TH/cmake/FindMKL.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 0a4dfb83f..4255e632b 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -283,9 +283,9 @@ IF(NOT MKL_FIND_QUIETLY)
   IF(MKL_FOUND)
 	IF (mkl64s)
 	  MESSAGE(STATUS "MKL 64bit library found")
-	ELSE(mkl64)
+	ELSE(mkl64s)
 	  MESSAGE(STATUS "MKL 32bit library found")
-	ENDIF(mkl64)
+	ENDIF(mkl64s)
   ELSE(MKL_FOUND)
     MESSAGE(STATUS "MKL library not found")
   ENDIF(MKL_FOUND)

From f7e397e60ec7f3df73586bc052fd8fc884485611 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 16:49:53 -0400
Subject: [PATCH 55/71] findblas.cmake add MKL_ILP64

---
 lib/TH/cmake/FindBLAS.cmake | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index ded8d5825..ca88be3f8 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -242,16 +242,23 @@ endif()
 # Determine if blas was compiled with the f2c conventions
 IF (BLAS_LIBRARIES)
   SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+  IF (MKL_ILP64)
+	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
+  ENDIF(MKL_ILP64)
   
   CHECK_C_SOURCE_RUNS("
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-#ifdef WIN32
-  typedef __int64 BLINT;
+#ifdef MKL_ILP64
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
+    typedef BLAS_INT __int64 
+  #else
+    typedef BLAS_INT long long int
+ #endif
 #else
-  typedef long BLINT;
+  typedef BLAS_INT int
 #endif
 BLINT four = 4;
 BLINT one = 1;
@@ -266,10 +273,14 @@ int main() {
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
-#ifdef WIN32
-  typedef __int64 BLINT;
+#ifdef MKL_ILP64
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
+    typedef BLAS_INT __int64 
+  #else
+    typedef BLAS_INT long long int
+ #endif
 #else
-  typedef long BLINT;
+  typedef BLAS_INT int
 #endif
 BLINT four = 4;
 BLINT one = 1;

From 95d02cc0d5b6a9f42e044536dd41007da3acbe69 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 16:59:05 -0400
Subject: [PATCH 56/71] fix typo

---
 lib/TH/cmake/FindBLAS.cmake | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index ca88be3f8..51f1a2523 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -260,8 +260,8 @@ float y[4] = { .1, .01, .001, .0001 };
 #else
   typedef BLAS_INT int
 #endif
-BLINT four = 4;
-BLINT one = 1;
+BLAS_INT four = 4;
+BLAS_INT one = 1;
 extern double sdot_();
 int main() {
   double r = sdot_(&four, x, &one, y, &one);
@@ -282,11 +282,10 @@ float y[4] = { .1, .01, .001, .0001 };
 #else
   typedef BLAS_INT int
 #endif
-BLINT four = 4;
-BLINT one = 1;
+BLAS_INT four = 4;
+BLAS_INT one = 1;
 extern float sdot_();
 int main() {
-  int i;
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" BLAS_F2C_FLOAT_WORKS )

From b3b2cf394c09ef22ea2fec914c813fbe6dc43d09 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:13:15 -0400
Subject: [PATCH 57/71] add compile to cmake f2c test

---
 lib/TH/cmake/FindBLAS.cmake | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 51f1a2523..f0166cd87 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -246,7 +246,7 @@ IF (BLAS_LIBRARIES)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
   ENDIF(MKL_ILP64)
   
-  CHECK_C_SOURCE_RUNS("
+  set(f2c_code_d "
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
@@ -266,9 +266,16 @@ extern double sdot_();
 int main() {
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
-}" BLAS_F2C_DOUBLE_WORKS )
+}" )
 
-  CHECK_C_SOURCE_RUNS("
+  # Quote the expansion: the C source contains ';', which would otherwise split it into several arguments.
+  CHECK_C_SOURCE_COMPILES("${f2c_code_d}" BLAS_F2C_DOUBLE_COMPILES )
+  IF (NOT BLAS_F2C_DOUBLE_COMPILES)
+    MESSAGE(STATUS "Warning F2C double check did not compile!!")
+  ENDIF(NOT BLAS_F2C_DOUBLE_COMPILES)
+  CHECK_C_SOURCE_RUNS("${f2c_code_d}" BLAS_F2C_DOUBLE_WORKS )
+
+  set(f2c_code_f "
 #include <stdlib.h>
 #include <stdio.h>
 float x[4] = { 1, 2, 3, 4 };
@@ -288,7 +295,14 @@ extern float sdot_();
 int main() {
   double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
-}" BLAS_F2C_FLOAT_WORKS )
+}" )
+
+  # Quote the expansion: the C source contains ';', which would otherwise split it into several arguments.
+  CHECK_C_SOURCE_COMPILES("${f2c_code_f}" BLAS_F2C_FLOAT_COMPILES )
+  IF (NOT BLAS_F2C_FLOAT_COMPILES)
+    MESSAGE(STATUS "Warning F2C float check did not compile!!")
+  ENDIF(NOT BLAS_F2C_FLOAT_COMPILES)
+  CHECK_C_SOURCE_RUNS("${f2c_code_f}" BLAS_F2C_FLOAT_WORKS )
 
   IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     MESSAGE(STATUS "This BLAS uses the F2C return conventions")

From 6782b83f3f33b1c9088dcfbf1a5c94027ff94164 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:30:39 -0400
Subject: [PATCH 58/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index f0166cd87..5b90b2320 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -245,6 +245,7 @@ IF (BLAS_LIBRARIES)
   IF (MKL_ILP64)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
   ENDIF(MKL_ILP64)
+  SET(CMAKE_VERBOSE_MAKEFILE TRUE)
   
   set(f2c_code_d "
 #include <stdlib.h>

From d5b1f60506f47beba5deb6cc268979632ea84950 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:34:06 -0400
Subject: [PATCH 59/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 5b90b2320..ef011fb43 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -245,7 +245,6 @@ IF (BLAS_LIBRARIES)
   IF (MKL_ILP64)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
   ENDIF(MKL_ILP64)
-  SET(CMAKE_VERBOSE_MAKEFILE TRUE)
   
   set(f2c_code_d "
 #include <stdlib.h>
@@ -256,7 +255,7 @@ float y[4] = { .1, .01, .001, .0001 };
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
     typedef BLAS_INT __int64 
   #else
-    typedef BLAS_INT long long int
+    typedef BLAS_INT long long
  #endif
 #else
   typedef BLAS_INT int
@@ -285,7 +284,7 @@ float y[4] = { .1, .01, .001, .0001 };
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
     typedef BLAS_INT __int64 
   #else
-    typedef BLAS_INT long long int
+    typedef BLAS_INT long long
  #endif
 #else
   typedef BLAS_INT int

From 4a6a02dc29b90a945a43ee7b8bad87fb76b3477e Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:37:09 -0400
Subject: [PATCH 60/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index ef011fb43..eece2693f 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -253,12 +253,12 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    typedef BLAS_INT __int64 
+    typedef BLAS_INT __int64; 
   #else
-    typedef BLAS_INT long long
+    typedef BLAS_INT long long;
  #endif
 #else
-  typedef BLAS_INT int
+  typedef BLAS_INT int;
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
@@ -282,12 +282,12 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    typedef BLAS_INT __int64 
+    typedef BLAS_INT __int64;
   #else
-    typedef BLAS_INT long long
+    typedef BLAS_INT long long;
  #endif
 #else
-  typedef BLAS_INT int
+  typedef BLAS_INT int;
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;

From 04c68994894ae59913e612bedc6197fd1bfac4a9 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:45:08 -0400
Subject: [PATCH 61/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index eece2693f..ffa0ce21e 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -253,9 +253,9 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    typedef BLAS_INT __int64; 
+    #define BLAS_INT __int64; 
   #else
-    typedef BLAS_INT long long;
+    #define BLAS_INT long long;
  #endif
 #else
   typedef BLAS_INT int;
@@ -282,12 +282,12 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    typedef BLAS_INT __int64;
+    #define BLAS_INT __int64;
   #else
-    typedef BLAS_INT long long;
+    #define BLAS_INT long long;
  #endif
 #else
-  typedef BLAS_INT int;
+  #define BLAS_INT int;
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;

From 942b6835666faf849bfffa25b90785895315a029 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:47:51 -0400
Subject: [PATCH 62/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index ffa0ce21e..e8af71d3e 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -271,6 +271,7 @@ int main() {
   CHECK_C_SOURCE_COMPILES(${f2c_code_d} BLAS_F2C_DOUBLE_COMPILES )
   IF (NOT BLAS_F2C_DOUBLE_COMPILES)
     MESSAGE(STATUS "Warning F2C double check did not compile!!")
+	MESSAGE(STATUS ${f2c_code_d})
   ENDIF(NOT BLAS_F2C_DOUBLE_COMPILES)
   
   CHECK_C_SOURCE_RUNS(${f2c_code_d} BLAS_F2C_DOUBLE_WORKS )

From aaec8bb0056a61146f38c4438023d8415257da5a Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 17:56:16 -0400
Subject: [PATCH 63/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 80 ++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index e8af71d3e..3681e1338 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -246,26 +246,26 @@ IF (BLAS_LIBRARIES)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
   ENDIF(MKL_ILP64)
   
-  set(f2c_code_d "
-#include <stdlib.h>
-#include <stdio.h>
-float x[4] = { 1, 2, 3, 4 };
-float y[4] = { .1, .01, .001, .0001 };
-#ifdef MKL_ILP64
-  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    #define BLAS_INT __int64; 
-  #else
-    #define BLAS_INT long long;
- #endif
-#else
-  typedef BLAS_INT int;
-#endif
-BLAS_INT four = 4;
-BLAS_INT one = 1;
-extern double sdot_();
-int main() {
-  double r = sdot_(&four, x, &one, y, &one);
-  exit((float)r != (float).1234);
+  set(f2c_code_d "\
+#include <stdlib.h>\
+#include <stdio.h>\
+float x[4] = { 1, 2, 3, 4 };\
+float y[4] = { .1, .01, .001, .0001 };\
+#ifdef MKL_ILP64 \
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) \
+    #define BLAS_INT __int64; \
+  #else \
+    #define BLAS_INT long long; \
+ #endif \
+#else \
+  #define BLAS_INT int; \
+#endif \
+BLAS_INT four = 4; \
+BLAS_INT one = 1; \
+extern double sdot_(); \
+int main() { \
+  double r = sdot_(&four, x, &one, y, &one); \
+  exit((float)r != (float).1234); \
 }" )
 
   CHECK_C_SOURCE_COMPILES(${f2c_code_d} BLAS_F2C_DOUBLE_COMPILES )
@@ -276,26 +276,26 @@ int main() {
   
   CHECK_C_SOURCE_RUNS(${f2c_code_d} BLAS_F2C_DOUBLE_WORKS )
 
-  set(f2c_code_f "
-#include <stdlib.h>
-#include <stdio.h>
-float x[4] = { 1, 2, 3, 4 };
-float y[4] = { .1, .01, .001, .0001 };
-#ifdef MKL_ILP64
-  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    #define BLAS_INT __int64;
-  #else
-    #define BLAS_INT long long;
- #endif
-#else
-  #define BLAS_INT int;
-#endif
-BLAS_INT four = 4;
-BLAS_INT one = 1;
-extern float sdot_();
-int main() {
-  double r = sdot_(&four, x, &one, y, &one);
-  exit((float)r != (float).1234);
+  set(f2c_code_f "\
+#include <stdlib.h>\
+#include <stdio.h>\
+float x[4] = { 1, 2, 3, 4 };\
+float y[4] = { .1, .01, .001, .0001 };\
+#ifdef MKL_ILP64\
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) \
+    #define BLAS_INT __int64;\
+  #else\
+    #define BLAS_INT long long;\
+ #endif\
+#else\
+  #define BLAS_INT int;\
+#endif\
+BLAS_INT four = 4;\
+BLAS_INT one = 1;\
+extern float sdot_();\
+int main() {\
+  double r = sdot_(&four, x, &one, y, &one);\
+  exit((float)r != (float).1234);\
 }" )
 
   CHECK_C_SOURCE_COMPILES(${f2c_code_f} BLAS_F2C_FLOAT_COMPILES )

From 4f690e7adbfd24e818f2368ddd17f0884db23a9c Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 18:02:10 -0400
Subject: [PATCH 64/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 90 ++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 3681e1338..3414be0f9 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -246,64 +246,64 @@ IF (BLAS_LIBRARIES)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
   ENDIF(MKL_ILP64)
   
-  set(f2c_code_d "\
-#include <stdlib.h>\
-#include <stdio.h>\
-float x[4] = { 1, 2, 3, 4 };\
-float y[4] = { .1, .01, .001, .0001 };\
-#ifdef MKL_ILP64 \
-  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) \
-    #define BLAS_INT __int64; \
-  #else \
-    #define BLAS_INT long long; \
- #endif \
-#else \
-  #define BLAS_INT int; \
-#endif \
-BLAS_INT four = 4; \
-BLAS_INT one = 1; \
-extern double sdot_(); \
-int main() { \
-  double r = sdot_(&four, x, &one, y, &one); \
-  exit((float)r != (float).1234); \
+  set(f2c_code_d "
+#include <stdlib.h>
+#include <stdio.h>
+float x[4] = { 1, 2, 3, 4 };
+float y[4] = { .1, .01, .001, .0001 };
+#ifdef MKL_ILP64
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
+    #define BLAS_INT __int64; 
+  #else
+    #define BLAS_INT long long;
+ #endif
+#else
+  typedef BLAS_INT int;
+#endif
+BLAS_INT four = 4;
+BLAS_INT one = 1;
+extern double sdot_();
+int main() {
+  double r = sdot_(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
 }" )
 
-  CHECK_C_SOURCE_COMPILES(${f2c_code_d} BLAS_F2C_DOUBLE_COMPILES )
+  CHECK_C_SOURCE_COMPILES("${f2c_code_d}" BLAS_F2C_DOUBLE_COMPILES )
   IF (NOT BLAS_F2C_DOUBLE_COMPILES)
     MESSAGE(STATUS "Warning F2C double check did not compile!!")
-	MESSAGE(STATUS ${f2c_code_d})
+	MESSAGE(STATUS "${f2c_code_d}")
   ENDIF(NOT BLAS_F2C_DOUBLE_COMPILES)
   
-  CHECK_C_SOURCE_RUNS(${f2c_code_d} BLAS_F2C_DOUBLE_WORKS )
+  CHECK_C_SOURCE_RUNS("${f2c_code_d}" BLAS_F2C_DOUBLE_WORKS )
 
-  set(f2c_code_f "\
-#include <stdlib.h>\
-#include <stdio.h>\
-float x[4] = { 1, 2, 3, 4 };\
-float y[4] = { .1, .01, .001, .0001 };\
-#ifdef MKL_ILP64\
-  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) \
-    #define BLAS_INT __int64;\
-  #else\
-    #define BLAS_INT long long;\
- #endif\
-#else\
-  #define BLAS_INT int;\
-#endif\
-BLAS_INT four = 4;\
-BLAS_INT one = 1;\
-extern float sdot_();\
-int main() {\
-  double r = sdot_(&four, x, &one, y, &one);\
-  exit((float)r != (float).1234);\
+  set(f2c_code_f "
+#include <stdlib.h>
+#include <stdio.h>
+float x[4] = { 1, 2, 3, 4 };
+float y[4] = { .1, .01, .001, .0001 };
+#ifdef MKL_ILP64
+  #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
+    #define BLAS_INT __int64;
+  #else
+    #define BLAS_INT long long;
+ #endif
+#else
+  #define BLAS_INT int;
+#endif
+BLAS_INT four = 4;
+BLAS_INT one = 1;
+extern float sdot_();
+int main() {
+  double r = sdot_(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
 }" )
 
-  CHECK_C_SOURCE_COMPILES(${f2c_code_f} BLAS_F2C_FLOAT_COMPILES )
+  CHECK_C_SOURCE_COMPILES("${f2c_code_f}" BLAS_F2C_FLOAT_COMPILES )
   IF (NOT BLAS_F2C_FLOAT_COMPILES)
     MESSAGE(STATUS "Warning F2C float check did not compile!!")
   ENDIF(NOT BLAS_F2C_FLOAT_COMPILES)
   
-  CHECK_C_SOURCE_RUNS(${f2c_code_f} BLAS_F2C_FLOAT_WORKS )
+  CHECK_C_SOURCE_RUNS("${f2c_code_f}" BLAS_F2C_FLOAT_WORKS )
 
   IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
     MESSAGE(STATUS "This BLAS uses the F2C return conventions")

From 05558b4158636593207a0b171eaec21ba31b8c19 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 18:10:06 -0400
Subject: [PATCH 65/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 3414be0f9..0236fb907 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -258,14 +258,15 @@ float y[4] = { .1, .01, .001, .0001 };
     #define BLAS_INT long long;
  #endif
 #else
-  typedef BLAS_INT int;
+  #define BLAS_INT int;
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
 extern double sdot_();
 int main() {
   double r = sdot_(&four, x, &one, y, &one);
-  exit((float)r != (float).1234);
+  //exit((float)r != (float).1234);
+  exit(1);
 }" )
 
   CHECK_C_SOURCE_COMPILES("${f2c_code_d}" BLAS_F2C_DOUBLE_COMPILES )
@@ -295,7 +296,8 @@ BLAS_INT one = 1;
 extern float sdot_();
 int main() {
   double r = sdot_(&four, x, &one, y, &one);
-  exit((float)r != (float).1234);
+  //exit((float)r != (float).1234);
+  exit(0);
 }" )
 
   CHECK_C_SOURCE_COMPILES("${f2c_code_f}" BLAS_F2C_FLOAT_COMPILES )

From 23a2a0076e14844873cbc8624e5b0e31f2ef88ae Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 18:12:00 -0400
Subject: [PATCH 66/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 0236fb907..e0ca44731 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -264,7 +264,7 @@ BLAS_INT four = 4;
 BLAS_INT one = 1;
 extern double sdot_();
 int main() {
-  double r = sdot_(&four, x, &one, y, &one);
+  //double r = sdot_(&four, x, &one, y, &one);
   //exit((float)r != (float).1234);
   exit(1);
 }" )
@@ -295,7 +295,7 @@ BLAS_INT four = 4;
 BLAS_INT one = 1;
 extern float sdot_();
 int main() {
-  double r = sdot_(&four, x, &one, y, &one);
+  //double r = sdot_(&four, x, &one, y, &one);
   //exit((float)r != (float).1234);
   exit(0);
 }" )

From c0672d91331b3a085d14ba4bafe855c901f7f1d8 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 18:16:58 -0400
Subject: [PATCH 67/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index e0ca44731..77b745a2a 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -244,6 +244,7 @@ IF (BLAS_LIBRARIES)
   SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
   IF (MKL_ILP64)
 	SET(CMAKE_REQUIRED_DEFINITIONS -DMKL_ILP64)
+	MESSAGE(STATUS "Checking F2C with MKL ILP64 ${CMAKE_REQUIRED_DEFINITIONS}")
   ENDIF(MKL_ILP64)
   
   set(f2c_code_d "

From 122f16c5ac10cdfa5c75f4019d9c7db02ef3f336 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Wed, 24 May 2017 18:29:09 -0400
Subject: [PATCH 68/71] debug

---
 lib/TH/cmake/FindBLAS.cmake | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 77b745a2a..0d9dbcd55 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -263,11 +263,10 @@ float y[4] = { .1, .01, .001, .0001 };
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
-extern double sdot_();
+extern double sdot();
 int main() {
-  //double r = sdot_(&four, x, &one, y, &one);
-  //exit((float)r != (float).1234);
-  exit(1);
+  double r = sdot(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
 }" )
 
   CHECK_C_SOURCE_COMPILES("${f2c_code_d}" BLAS_F2C_DOUBLE_COMPILES )
@@ -294,11 +293,10 @@ float y[4] = { .1, .01, .001, .0001 };
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
-extern float sdot_();
+extern float sdot();
 int main() {
-  //double r = sdot_(&four, x, &one, y, &one);
-  //exit((float)r != (float).1234);
-  exit(0);
+  double r = sdot(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
 }" )
 
   CHECK_C_SOURCE_COMPILES("${f2c_code_f}" BLAS_F2C_FLOAT_COMPILES )

From 1a15395f3d6933d8acf57b1ff1e74a0ac17a022f Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Thu, 25 May 2017 10:13:16 -0400
Subject: [PATCH 69/71] remove ; in #define

---
 lib/TH/cmake/FindBLAS.cmake | 20 ++++++++++----------
 lib/TH/cmake/FindMKL.cmake  |  4 ++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
index 0d9dbcd55..8384bb706 100644
--- a/lib/TH/cmake/FindBLAS.cmake
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -254,18 +254,18 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    #define BLAS_INT __int64; 
+    #define BLAS_INT __int64
   #else
-    #define BLAS_INT long long;
+    #define BLAS_INT long long
  #endif
 #else
-  #define BLAS_INT int;
+  #define BLAS_INT int
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
-extern double sdot();
+extern double sdot_();
 int main() {
-  double r = sdot(&four, x, &one, y, &one);
+  double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" )
 
@@ -284,18 +284,18 @@ float x[4] = { 1, 2, 3, 4 };
 float y[4] = { .1, .01, .001, .0001 };
 #ifdef MKL_ILP64
   #if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) 
-    #define BLAS_INT __int64;
+    #define BLAS_INT __int64
   #else
-    #define BLAS_INT long long;
+    #define BLAS_INT long long
  #endif
 #else
-  #define BLAS_INT int;
+  #define BLAS_INT int
 #endif
 BLAS_INT four = 4;
 BLAS_INT one = 1;
-extern float sdot();
+extern float sdot_();
 int main() {
-  double r = sdot(&four, x, &one, y, &one);
+  double r = sdot_(&four, x, &one, y, &one);
   exit((float)r != (float).1234);
 }" )
 
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
index 4255e632b..d949dee70 100644
--- a/lib/TH/cmake/FindMKL.cmake
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -282,9 +282,9 @@ ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
 IF(NOT MKL_FIND_QUIETLY)
   IF(MKL_FOUND)
 	IF (mkl64s)
-	  MESSAGE(STATUS "MKL 64bit library found")
+	  MESSAGE(STATUS "MKL 64bit library found: ${mkl64s}")
 	ELSE(mkl64s)
-	  MESSAGE(STATUS "MKL 32bit library found")
+	  MESSAGE(STATUS "MKL 32bit library found: ${mkl64s}")
 	ENDIF(mkl64s)
   ELSE(MKL_FOUND)
     MESSAGE(STATUS "MKL library not found")

From 99efc93373ad8455ec3f09710fce6474b97461f7 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Thu, 1 Jun 2017 18:34:57 -0400
Subject: [PATCH 70/71] merge from torch master (#8)

* Fast transposed copy

* Add scatterAdd

* lua 5.3 changes and gcc constants

* Adding support for ADD_TORCH_LIBRARY macro
---
 cmake/TorchPackage.cmake      | 33 ++++++++++-------
 generic/Tensor.c              |  8 ++--
 generic/TensorOperator.c      | 16 ++++----
 lib/TH/CMakeLists.txt         | 14 ++++---
 lib/TH/THDiskFile.c           |  3 ++
 lib/TH/generic/THTensorCopy.c | 70 ++++++++++++++++++++++++++++++++++-
 lib/TH/generic/THTensorMath.c | 29 +++++++++++++++
 lib/TH/generic/THTensorMath.h |  1 +
 lib/luaT/CMakeLists.txt       | 11 ++++--
 9 files changed, 152 insertions(+), 33 deletions(-)

diff --git a/cmake/TorchPackage.cmake b/cmake/TorchPackage.cmake
index 7fcbdff47..f966dacab 100644
--- a/cmake/TorchPackage.cmake
+++ b/cmake/TorchPackage.cmake
@@ -1,5 +1,21 @@
 # -*- cmake -*-
 
+MACRO(ADD_TORCH_LIBRARY package type src)
+  IF ("${type}" STREQUAL "STATIC")
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} STATIC ${src})
+    else()
+      ADD_LIBRARY(${package} STATIC ${src})
+    endif()
+  ELSE()
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} ${type} ${src})
+    else()
+      ADD_LIBRARY(${package} ${type} ${src})
+    endif()
+  ENDIF()
+ENDMACRO()
+
 MACRO(ADD_TORCH_PACKAGE package src luasrc)
   INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
   INCLUDE_DIRECTORIES(${Torch_LUA_INCLUDE_DIR})
@@ -8,17 +24,7 @@ MACRO(ADD_TORCH_PACKAGE package src luasrc)
  # As per CMake doc, macro arguments are not variables, so simple test syntax not working
   IF(NOT "${src}" STREQUAL "")
 
-    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
-      CUDA_ADD_LIBRARY(${package} MODULE ${src})
-      if(BUILD_STATIC)
-        CUDA_ADD_LIBRARY(${package}_static STATIC ${src})
-      endif()
-    else()
-      ADD_LIBRARY(${package} MODULE ${src})
-      if(BUILD_STATIC)
-        ADD_LIBRARY(${package}_static STATIC ${src})
-      endif()
-    endif()
+    ADD_TORCH_LIBRARY(${package} MODULE "${src}")
 
     ### Torch packages supposes libraries prefix is "lib"
     SET_TARGET_PROPERTIES(${package} PROPERTIES
@@ -31,12 +37,13 @@ MACRO(ADD_TORCH_PACKAGE package src luasrc)
         LINK_FLAGS "-undefined dynamic_lookup")
     ENDIF()
 
-    if(BUILD_STATIC)
+    IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
+      ADD_TORCH_LIBRARY(${package}_static STATIC "${src}")
       SET_TARGET_PROPERTIES(${package}_static PROPERTIES
         COMPILE_FLAGS "-fPIC")
       SET_TARGET_PROPERTIES(${package}_static PROPERTIES
         PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "${package}")
-    endif()
+    ENDIF()
 
     INSTALL(TARGETS ${package}
       RUNTIME DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR}
diff --git a/generic/Tensor.c b/generic/Tensor.c
index aabbbdc39..112a4bd63 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -142,7 +142,7 @@ static int torch_Tensor_(new)(lua_State *L)
           THTensor_(free)(tensor);
           THError("invalid element (not a number)");
         }
-        THStorage_(set)(THTensor_(storage)(tensor), si++, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
+        THStorage_(set)(THTensor_(storage)(tensor), si++, luaG_(checkreal)(L, -1));
         lua_pop(L, 1);
       }
 
@@ -1172,7 +1172,7 @@ static int torch_Tensor_(apply)(lua_State *L)
                   lua_call(L, 1, 1);
                   if(lua_isnumber(L, 3))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 3));
+                    *tensor_data = luaG_(checkreal)(L, 3);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 3))
@@ -1198,7 +1198,7 @@ static int torch_Tensor_(map)(lua_State *L)
                   lua_call(L, 2, 1);
                   if(lua_isnumber(L, 4))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 4));
+                    *tensor_data = luaG_(checkreal)(L, 4);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 4))
@@ -1226,7 +1226,7 @@ static int torch_Tensor_(map2)(lua_State *L)
                   lua_call(L, 3, 1);
                   if(lua_isnumber(L, 5))
                   {
-                    *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 5));
+                    *tensor_data = luaG_(checkreal)(L, 5);
                     lua_pop(L, 1);
                   }
                   else if(lua_isnil(L, 5))
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
index e131c5733..37b2a0889 100644
--- a/generic/TensorOperator.c
+++ b/generic/TensorOperator.c
@@ -2,6 +2,8 @@
 #define TH_GENERIC_FILE "generic/TensorOperator.c"
 #else
 
+#include "luaG.h"
+
 static int torch_TensorOperator_(__add__)(lua_State *L)
 {
   THTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
@@ -19,13 +21,13 @@ static int torch_TensorOperator_(__add__)(lua_State *L)
     {
       THTensor_(resizeAs)(r, tensor2);
       THTensor_(copy)(r, tensor2);
-      THTensor_(add)(r, r, luaL_checknumber(L, 1));
+      THTensor_(add)(r, r, luaG_(checkreal)(L, 1));
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(add)(r, r, luaL_checknumber(L, 2));
+      THTensor_(add)(r, r, luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -53,14 +55,14 @@ static int torch_TensorOperator_(__sub__)(lua_State *L)
     if(!tensor1 && tensor2)
     {
       THTensor_(resizeAs)(r, tensor2);
-      THTensor_(fill)(r, luaL_checknumber(L, 1));
+      THTensor_(fill)(r, luaG_(checkreal)(L, 1));
       THTensor_(cadd)(r, r, -1, tensor2);
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(add)(r, r, -(real)luaL_checknumber(L, 2));
+      THTensor_(add)(r, r, -luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -103,13 +105,13 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
     {
       THTensor_(resizeAs)(r, tensor2);
       THTensor_(copy)(r, tensor2);
-      THTensor_(mul)(r, r, luaL_checknumber(L, 1));
+      THTensor_(mul)(r, r, luaG_(checkreal)(L, 1));
     }
     else if(tensor1 && !tensor2)
     {
       THTensor_(resizeAs)(r, tensor1);
       THTensor_(copy)(r, tensor1);
-      THTensor_(mul)(r, r, luaL_checknumber(L, 2));
+      THTensor_(mul)(r, r, luaG_(checkreal)(L, 2));
     }
     else
     {
@@ -117,7 +119,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
       int dims = tensor2->nDimension;
 
       if(dimt == 1 && dims == 1)
-        lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
+        luaG_(pushreal)(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
       else if(dimt == 2 && dims == 1)
       {
         THTensor_(resize1d)(r, tensor1->size[0]);
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index c4e6694f7..b481a8a73 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -145,7 +145,6 @@ IF(C_AVX2_FOUND)
   SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}")
 ENDIF(C_AVX2_FOUND)
 
-
 CHECK_C_SOURCE_RUNS("
 #include <stdatomic.h>
 int main()
@@ -243,10 +242,15 @@ SET(src ${src} ${hdr} ${simd})
 ##### build section
 ######################################################################
 
-ADD_LIBRARY(TH SHARED ${src})
-if(BUILD_STATIC)
-  ADD_LIBRARY(TH_static STATIC ${src})
-endif()
+ADD_TORCH_LIBRARY(TH SHARED "${src}")
+
+IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
+  ADD_TORCH_LIBRARY(TH_static STATIC "${src}")
+  SET_TARGET_PROPERTIES(TH_static PROPERTIES
+    COMPILE_FLAGS "-fPIC")
+  SET_TARGET_PROPERTIES(TH_static PROPERTIES
+    PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "TH")
+ENDIF()
 
 IF(NOT TH_SO_VERSION)
   SET(TH_SO_VERSION 0)
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 01b195131..3f57b3b35 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -3,6 +3,9 @@
 #include "THFilePrivate.h"
 
 #include <stdint.h>
+#ifndef LLONG_MAX
+#define LLONG_MAX 9223372036854775807LL
+#endif
 
 typedef struct THDiskFile__
 {
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index e90972870..71ccfdd0f 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -2,6 +2,70 @@
 #define TH_GENERIC_FILE "generic/THTensorCopy.c"
 #else
 
+int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) {
+  const int MIN_SZ = 60 * 60;
+  return THTensor_(isContiguous)(tensor) &&
+         THTensor_(nDimension)(src) == 2 &&
+         THTensor_(stride)(src, 0) == 1 &&
+         THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) &&
+         THTensor_(nElement)(tensor) >= MIN_SZ;
+}
+
+// special case copy where tensor is contiguous and src is a transposed matrix
+// This can be generalized to most copies, but it's trickier
+void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) {
+  #define MIN(x, y) (((x) < (y)) ? (x) : (y))
+  #define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#ifdef TH_REAL_IS_BYTE
+  const int BLOCK_SZ = 120;
+#else
+  const int BLOCK_SZ = 60;
+#endif
+
+  THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ);
+  real *sp = THTensor_(data)(src);
+  real *rp = THTensor_(data)(tensor);
+  real *bp = THTensor_(data)(buf);
+
+  long NR = THTensor_(size)(src, 0);
+  long NC = THTensor_(size)(src, 1);
+  for (long R = 0; R < NR; R += BLOCK_SZ) {
+    for (long C = 0; C < NC; C += BLOCK_SZ) {
+      real *spo = sp + R + C * NR;
+      real *rpo = rp + C + R * NC;
+
+      int nr = MIN(NR - R, BLOCK_SZ);
+      int nc = MIN(NC - C, BLOCK_SZ);
+
+      // 1. copy columns from src to buf
+      for (int c = 0; c < nc; c++) {
+        memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real));
+      }
+
+      // 2. transpose buf in place
+      int rc_max = MAX(nr, nc);
+      int rc_min = MIN(nr, nc);
+      for (int r = 0; r < rc_max; r++) {
+        int end = MIN(r, rc_min);
+        for (int c = 0; c < end; c++) {
+          real tmp = bp[r + BLOCK_SZ * c];
+          bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c];
+          bp[r * BLOCK_SZ + c] = tmp;
+        }
+      }
+
+      // 3. copy rows from buf to dst
+      for (int r = 0; r < nr; r++) {
+        memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real));
+      }
+    }
+  }
+  THTensor_(free)(buf);
+  #undef MIN
+  #undef MAX
+}
+
 void THTensor_(copy)(THTensor *tensor, THTensor *src)
 {
   if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
@@ -9,9 +73,13 @@ void THTensor_(copy)(THTensor *tensor, THTensor *src)
     real *rp = THTensor_(data)(tensor);
     ptrdiff_t sz = THTensor_(nElement)(tensor);
 #ifndef TH_REAL_IS_HALF
-    THVector_(copy)(rp, sp, sz); 
+    THVector_(copy)(rp, sp, sz);
 #else
     memcpy(rp, sp, sz * sizeof(real));
+#endif
+#ifndef TH_REAL_IS_HALF
+  } else if (THTensor_(copyTransposeValid)(tensor, src)) {
+    THTensor_(copyTranspose)(tensor, src);
 #endif
   } else {
     TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 69ee4a8e4..68e208ce2 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -2,6 +2,10 @@
 #define TH_GENERIC_FILE "generic/THTensorMath.c"
 #else
 
+#ifndef NAN
+  #define NAN (nan(""))  /* C99 nan() takes a string tag; passing NULL is undefined behaviour */
+#endif
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -466,6 +470,31 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor
                        })
 }
 
+void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src)
+{
+  long elems_per_row, i, idx;
+
+  THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds");
+  THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3,
+             "Index tensor must have same dimensions as output tensor");
+  THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4,
+             "Input tensor must have same dimensions as output tensor");
+
+  elems_per_row = THLongTensor_size(index, dim);
+
+  TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim,
+                       for (i = 0; i < elems_per_row; ++i)
+                       {
+                         idx = *(index_data + i*index_stride);
+                         if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE)
+                         {
+                           THFree(TH_TENSOR_DIM_APPLY_counter);
+                           THError("Invalid index in scatterAdd");
+                         }
+                         tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] += *(src_data + i*src_stride);
+                       })
+}
+
 void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val)
 {
   long elems_per_row, i, idx;
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index a3cf4107e..bacc9df7f 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -18,6 +18,7 @@ TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index,
 
 TH_API void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index);
 TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
+TH_API void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
 TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val);
 
 TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src);
diff --git a/lib/luaT/CMakeLists.txt b/lib/luaT/CMakeLists.txt
index f33768c70..072991cb1 100644
--- a/lib/luaT/CMakeLists.txt
+++ b/lib/luaT/CMakeLists.txt
@@ -9,9 +9,14 @@ IF(LUALIB)
 ENDIF()
 
 ADD_LIBRARY(luaT SHARED luaT.h luaT.c)
-if(BUILD_STATIC)
+
+IF (BUILD_STATIC OR "$ENV{STATIC_TH}" STREQUAL "YES")
   ADD_LIBRARY(luaT_static STATIC luaT.h luaT.c)
-endif()
+  SET_TARGET_PROPERTIES(luaT_static PROPERTIES
+    COMPILE_FLAGS "-fPIC")
+  SET_TARGET_PROPERTIES(luaT_static PROPERTIES
+    PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "luaT")
+ENDIF()
 
 SET_TARGET_PROPERTIES(luaT PROPERTIES
   VERSION   0
@@ -41,5 +46,5 @@ GET_FILENAME_COMPONENT(LUAT_OUTPUT_NAME ${LUAT_OUTPUT_NAME} NAME)
 SET(LUAT_LIBRARIES "${Torch_INSTALL_LIB}/${LUAT_OUTPUT_NAME}")
 SET(LUAT_INCLUDE_DIR "${Torch_INSTALL_INCLUDE}")
 CONFIGURE_FILE(luaTConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake")
-INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake" 
+INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake"
   DESTINATION "${Torch_INSTALL_CMAKE_SUBDIR}")

From a1f5c11638279593e10f48f884908b8925ff4953 Mon Sep 17 00:00:00 2001
From: Eric Cosatto <cosatto@nec-labs.com>
Date: Tue, 2 Jan 2018 15:18:15 -0500
Subject: [PATCH 71/71] add b32/b64 modes to torch.save

Those modes are already supported by torch.load.
---
 File.lua | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/File.lua b/File.lua
index 62249a361..8ef9c71a5 100644
--- a/File.lua
+++ b/File.lua
@@ -376,15 +376,21 @@ function File:readObject()
    end
 end
 
--- simple helpers to save/load arbitrary objects/tables
+-- simple helpers to save/load arbitrary objects/tables 
 function torch.save(filename, object, mode, referenced)
-   assert(mode == nil or mode == 'binary' or mode == 'ascii', '"binary" or "ascii" (or nil) expected for mode')
+   assert(mode == nil or mode == 'binary' or mode == 'b32' or mode == 'b64' or mode == 'ascii', '"binary", "b32", "b64" or "ascii" (or nil) expected for mode')
    assert(referenced == nil or referenced == true or referenced == false, 'true or false (or nil) expected for referenced')
+   local longSize -- stays nil unless a fixed-width binary mode ('b32'/'b64') was requested
+   if mode == 'b32' or mode == 'b64' then
+      longSize = tonumber(mode:match('%d+')) / 8 -- bit width taken from the mode name, divided by 8 -> 4 or 8
+      mode = 'binary' -- 'b32'/'b64' go through file:binary() below; the width is applied via file:longSize()
+   end
    mode = mode or 'binary'
    referenced = referenced == nil and true or referenced
    local file = torch.DiskFile(filename, 'w')
    file[mode](file)
    file:referenced(referenced)
+   if longSize then file:longSize(longSize) end
    file:writeObject(object)
    file:close()
 end